/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */


#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/machasi.h>
#include <sys/niagaraasi.h>

#include "assym.h"


/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/kcopy routine.
 *
 *      ! WARNING : <Register usage convention>
 *      ! In kcopy(), %o5 holds the previous error handler and the
 *      ! LOFAULT_SET flag (in its low bits); in bcopy(), %o5 is null.
 *      ! %o5 is not available for any other use.
 *
 * On entry:
 *      ! Determine whether to use the FP register version or the
 *      ! leaf routine version depending on the size of the copy.
 *      ! Set up error handling accordingly.
 *      ! The transition point depends on FP_COPY.
 *      ! For both versions %o5 is reserved.
 *
 * kcopy():
 *      if(length > FP_COPY)
 *              go to regular_kcopy
 *
 *      ! Setup_leaf_rtn_error_handler
 *      %o5 = curthread->t_lofault;             ! save existing handler in %o5
 *      %o5 |= LOFAULT_SET;                     ! ORed with LOFAULT_SET flag
 *      curthread->t_lofault = .sm_copyerr;
 *      goto small_bcopy();
 *
 * regular_kcopy:
 *      save_registers()
 *      %o5 = curthread->t_lofault;             ! save existing handler in %o5
 *      %o5 |= LOFAULT_SET;                     ! ORed with LOFAULT_SET flag
 *      curthread->t_lofault = .copyerr;
 *      goto do_copy();
 *
 * bcopy():
 *      if(length > FP_COPY)
 *              go to regular_bcopy
 *
 *      ! Setup_leaf_rtn_error_handler
 *      %o5 = curthread->t_lofault;             ! save existing handler in %o5
 *      curthread->t_lofault = .sm_copyerr;
 *      goto small_bcopy();
 *
 * regular_bcopy:
 *      %o5 = curthread->t_lofault;             ! save existing handler in %o5
 *      curthread->t_lofault = .copyerr;
 *      goto do_copy();
 *
 * small_bcopy:
 *      ! handle copies smaller than FP_COPY
 *      restore t_lofault handler
 *      exit
 *
 * do_copy:
 *      ! handle copies larger than FP_COPY
 *      save fp_regs
 *      blockcopy;
 *      restore fp_regs
 *      restore t_lofault handler if we came from kcopy();
 *
 *
 * In leaf lofault handler:
 *      curthread->t_lofault = (%o5 & ~LOFAULT_SET);    ! restore old t_lofault
 *      return (errno)
 *
 * In lofault handler:
 *      curthread->t_lofault = (%o5 & ~LOFAULT_SET);    ! restore old t_lofault
 *      restore fp_regs
 *      return (errno)
 *
 *
 *
 * For all of bcopy/copyin/copyout the copy logic is specialized according
 * to how the src and dst are aligned and how much data needs to be moved.
 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL))
 *
 * N2/RF Flow :
 *
 * if (count < FP_COPY) {  (584 bytes)
 *   set small fault handler (no register window save/restore)
 *   if (count < SHORTCOPY) {  (7 bytes)
 *     copy bytes; go to short_exit
 *   } else {
 *     determine dst alignment, move minimum bytes/halfwords to
 *     get dst aligned on long word boundary
 *     if (src is on long word boundary) {
 * medlong:                                        src/dst aligned on 8 bytes
 *       copy with ldx/stx in 4-way unrolled loop;
 *       copy final 0-31 bytes; go to short_exit
 *     } else {                                 src/dst not aligned on 8 bytes
 *       if src is word aligned, ld/st words in 32-byte chunks
 *       if src is half word aligned, ld half, ld word, ld half; pack
 *         into long word, store long words in 32-byte chunks
 *       if src is byte aligned, ld byte,half,word parts; pack into long
 *         word, store long words in 32-byte chunks
 *       move final 0-31 bytes according to src alignment; go to short_exit
 *     }
 *   }
 * short_exit:
 *   restore trap handler if needed, retl
 * } else {                                        More than FP_COPY bytes
 *   set fault handler
 *   disable kernel preemption
 *   save registers, save FP registers if in use
 *   move bytes to align destination register on long word boundary
 *   if (src is on long word boundary) {           src/dst aligned on 8 bytes
 *     align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *     src alignments relative to a 64 byte boundary to select the
 *     16-way unrolled loop (128 bytes) to use for
 *     block load, fmovd, block-init-store, block-store, fmovd operations,
 *     then go to remain_stuff.
 * remain_stuff:
 *     move remaining bytes. go to long_exit
 *   } else {
 *     setup alignaddr for faligndata instructions
 *     align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *     src alignments to nearest long word relative to 64 byte boundary to
 *     select the 8-way unrolled loop (64 bytes) to use for
 *     block load, falign, fmovd, block-store loop
 *     (only use block-init-store when src/dst on 8 byte boundaries.)
 *     goto unalign_done.
 * unalign_done:
 *     move remaining bytes for unaligned cases. go to long_exit
 *   }
 * long_exit:
 *   restore %gsr, FP regs (either from stack or set to zero),
 *   restore trap handler, check for kernel preemption request,
 *   handle if needed, ret.
 * }
 *
 * Other platforms include hw_bcopy_limit_[1248] to control the exact
 * point where the FP register code is used. On those platforms, the
 * FP register code did not leave data in L2 cache, potentially affecting
 * performance more than the gain/loss from the algorithm difference.
 * For N2/RF, block store places data in the L2 cache, so use or non-use
 * of the FP registers has no effect on L2 cache behavior.
 * The cost for testing hw_bcopy_limit_* according to different
 * alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits
 * were not used. That cost was judged too high relative to the benefits,
 * so the hw_bcopy_limit option is omitted from this code.
 */

/*
 * At or below this number of bytes, we always copy byte-for-byte.
 */
#define SMALL_LIMIT     7

/*
 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
 * handler was set
 */
#define LOFAULT_SET 2

/*
 * This macro aligns data for the unaligned source cases.
 * data1, data2, and data3 are merged into data1 and data2;
 * data3 is preserved for the next merge.
 */
#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)    \
        sllx    data1, lshift, data1                            ;\
        srlx    data2, rshift, tmp                              ;\
        or      data1, tmp, data1                               ;\
        sllx    data2, lshift, data2                            ;\
        srlx    data3, rshift, tmp                              ;\
        or      data2, tmp, data2
/*
 * This macro aligns the data by merging data1 and data2
 * to form a double word.
 */
#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)        \
        sllx    data1, lshift, data1                            ;\
        srlx    data2, rshift, tmp                              ;\
        or      data1, tmp, data1
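
/*
 * Illustrative example (not assembled): with lshift = 16 and
 * rshift = 48 (the shifts always sum to 64), ALIGN_DATA_EW computes
 *
 *      data1 = (data1 << 16) | (data2 >> 48);
 *
 * joining the low six bytes of data1 with the high two bytes of data2
 * to form one aligned double word.  ALIGN_DATA applies the same merge
 * twice, preserving data3 as the next iteration's data2.
 */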

#if !defined(NIAGARA_IMPL)
/*
 * Flags set in the lower bits of the t_lofault address:
 * FPUSED_FLAG: The FP registers were in use and must be restored
 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
 * COPY_FLAGS: Both of the above
 *
 * Other flags:
 * KPREEMPT_FLAG: kpreempt needs to be called
 */
#define FPUSED_FLAG     1
#define LOFAULT_SET     2
#define COPY_FLAGS      (FPUSED_FLAG | LOFAULT_SET)
#define KPREEMPT_FLAG   4
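
/*
 * A minimal sketch (C-like, illustrative only) of the flag protocol.
 * The saved handler and the flags travel together in %o5; the handler
 * address is at least word aligned, so the low bits are free:
 *
 *      saved = curthread->t_lofault;           ! remember old handler
 *      saved |= LOFAULT_SET;                   ! and/or FPUSED_FLAG
 *      ...
 *      curthread->t_lofault = saved & ~COPY_FLAGS; ! strip flags to restore
 *
 * KPREEMPT_FLAG is never stored in the t_lofault address; the error
 * path carries it in a scratch register (%l1).
 */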

#define ALIGN_OFF_1_7                   \
        faligndata %d0, %d2, %d48       ;\
        faligndata %d2, %d4, %d50       ;\
        faligndata %d4, %d6, %d52       ;\
        faligndata %d6, %d8, %d54       ;\
        faligndata %d8, %d10, %d56      ;\
        faligndata %d10, %d12, %d58     ;\
        faligndata %d12, %d14, %d60     ;\
        faligndata %d14, %d16, %d62

#define ALIGN_OFF_8_15                  \
        faligndata %d2, %d4, %d48       ;\
        faligndata %d4, %d6, %d50       ;\
        faligndata %d6, %d8, %d52       ;\
        faligndata %d8, %d10, %d54      ;\
        faligndata %d10, %d12, %d56     ;\
        faligndata %d12, %d14, %d58     ;\
        faligndata %d14, %d16, %d60     ;\
        faligndata %d16, %d18, %d62

#define ALIGN_OFF_16_23                 \
        faligndata %d4, %d6, %d48       ;\
        faligndata %d6, %d8, %d50       ;\
        faligndata %d8, %d10, %d52      ;\
        faligndata %d10, %d12, %d54     ;\
        faligndata %d12, %d14, %d56     ;\
        faligndata %d14, %d16, %d58     ;\
        faligndata %d16, %d18, %d60     ;\
        faligndata %d18, %d20, %d62

#define ALIGN_OFF_24_31                 \
        faligndata %d6, %d8, %d48       ;\
        faligndata %d8, %d10, %d50      ;\
        faligndata %d10, %d12, %d52     ;\
        faligndata %d12, %d14, %d54     ;\
        faligndata %d14, %d16, %d56     ;\
        faligndata %d16, %d18, %d58     ;\
        faligndata %d18, %d20, %d60     ;\
        faligndata %d20, %d22, %d62

#define ALIGN_OFF_32_39                 \
        faligndata %d8, %d10, %d48      ;\
        faligndata %d10, %d12, %d50     ;\
        faligndata %d12, %d14, %d52     ;\
        faligndata %d14, %d16, %d54     ;\
        faligndata %d16, %d18, %d56     ;\
        faligndata %d18, %d20, %d58     ;\
        faligndata %d20, %d22, %d60     ;\
        faligndata %d22, %d24, %d62

#define ALIGN_OFF_40_47                 \
        faligndata %d10, %d12, %d48     ;\
        faligndata %d12, %d14, %d50     ;\
        faligndata %d14, %d16, %d52     ;\
        faligndata %d16, %d18, %d54     ;\
        faligndata %d18, %d20, %d56     ;\
        faligndata %d20, %d22, %d58     ;\
        faligndata %d22, %d24, %d60     ;\
        faligndata %d24, %d26, %d62

#define ALIGN_OFF_48_55                 \
        faligndata %d12, %d14, %d48     ;\
        faligndata %d14, %d16, %d50     ;\
        faligndata %d16, %d18, %d52     ;\
        faligndata %d18, %d20, %d54     ;\
        faligndata %d20, %d22, %d56     ;\
        faligndata %d22, %d24, %d58     ;\
        faligndata %d24, %d26, %d60     ;\
        faligndata %d26, %d28, %d62

#define ALIGN_OFF_56_63                 \
        faligndata %d14, %d16, %d48     ;\
        faligndata %d16, %d18, %d50     ;\
        faligndata %d18, %d20, %d52     ;\
        faligndata %d20, %d22, %d54     ;\
        faligndata %d22, %d24, %d56     ;\
        faligndata %d24, %d26, %d58     ;\
        faligndata %d26, %d28, %d60     ;\
        faligndata %d28, %d30, %d62
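
/*
 * Each ALIGN_OFF_m_n macro above produces one 64-byte output block in
 * %d48-%d62 from source data whose offset within a 64-byte block falls
 * in the byte range [m, n].  The eight variants differ only in which
 * input register pair the sliding faligndata window starts at.
 */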

/*
 * FP_COPY indicates the minimum number of bytes needed
 * to justify using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing FP_COPY below 256.
 */
#define FP_COPY                 584
#define SHORTCOPY               7
#define ASI_STBI_P              ASI_BLK_INIT_ST_QUAD_LDD_P
#define ASI_STBI_AIUS           ASI_BLK_INIT_QUAD_LDD_AIUS
#define CACHE_LINE              64
#define VIS_BLOCKSIZE           64
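
/*
 * Note: ASI_STBI_P/ASI_STBI_AIUS select block-initializing stores,
 * which allocate the destination line in L2 without first reading its
 * old contents, avoiding read-for-ownership traffic when a full cache
 * line is being overwritten.
 */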

/*
 * Size of stack frame needed to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use three quadrants of fp registers; to assure a
 * block-aligned three-block buffer in which to save them, we must
 * reserve four blocks on the stack.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 3 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define HWCOPYFRAMESIZE         ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
#define SAVED_FPREGS_OFFSET     (VIS_BLOCKSIZE * 4)
#define SAVED_FPREGS_ADJUST     ((VIS_BLOCKSIZE * 3) + 1)
#define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 8)
#define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 8)
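
/*
 * With VIS_BLOCKSIZE == 64 these evaluate to:
 *      HWCOPYFRAMESIZE     = 4*64 + 16 = 272
 *      SAVED_FPREGS_OFFSET = 256
 *      SAVED_FPREGS_ADJUST = 193
 *      SAVED_FPRS_OFFSET   = 264
 *      SAVED_GSR_OFFSET    = 272
 * Rounding (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) down to a 64-byte
 * boundary always yields a three-block save area that lies entirely
 * within the reserved four-block region.
 */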

/*
 * In FP copies, if we do not have preserved data to restore over
 * the fp regs we used, we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 */
#define FZERO                           \
        fzero   %f0                     ;\
        fzero   %f2                     ;\
        faddd   %f0, %f2, %f4           ;\
        fmuld   %f0, %f2, %f6           ;\
        faddd   %f0, %f2, %f8           ;\
        fmuld   %f0, %f2, %f10          ;\
        faddd   %f0, %f2, %f12          ;\
        fmuld   %f0, %f2, %f14          ;\
        faddd   %f0, %f2, %f16          ;\
        fmuld   %f0, %f2, %f18          ;\
        faddd   %f0, %f2, %f20          ;\
        fmuld   %f0, %f2, %f22          ;\
        faddd   %f0, %f2, %f24          ;\
        fmuld   %f0, %f2, %f26          ;\
        faddd   %f0, %f2, %f28          ;\
        fmuld   %f0, %f2, %f30          ;\
        faddd   %f0, %f2, %f48          ;\
        fmuld   %f0, %f2, %f50          ;\
        faddd   %f0, %f2, %f52          ;\
        fmuld   %f0, %f2, %f54          ;\
        faddd   %f0, %f2, %f56          ;\
        fmuld   %f0, %f2, %f58          ;\
        faddd   %f0, %f2, %f60          ;\
        fmuld   %f0, %f2, %f62
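
/*
 * FZERO clears exactly the three quadrants the copy loops use
 * (%d0-%d30 and %d48-%d62); %d32-%d46 are never touched by the
 * copy code, so they need no scrubbing.
 */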

/*
 * Macros to save and restore fp registers to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP.
 */
#define BST_FP_TOSTACK(tmp1)                                    \
        /* membar #Sync */                                      ;\
        add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
        and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
        stda    %f0, [tmp1]ASI_BLK_P                            ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        stda    %f16, [tmp1]ASI_BLK_P                           ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        stda    %f48, [tmp1]ASI_BLK_P                           ;\
        membar  #Sync

#define BLD_FP_FROMSTACK(tmp1)                                  \
        /* membar #Sync - provided at copy completion */        ;\
        add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
        and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
        ldda    [tmp1]ASI_BLK_P, %f0                            ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        ldda    [tmp1]ASI_BLK_P, %f16                           ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        ldda    [tmp1]ASI_BLK_P, %f48                           ;\
        membar  #Sync

#endif /* !NIAGARA_IMPL */

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */
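
/*
 * A typical (illustrative) caller:
 *
 *      int err;
 *
 *      if ((err = kcopy(from, to, count)) != 0)
 *              return (err);           ! err is the errno from the fault
 */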

        .seg    ".text"
        .align  4

        ENTRY(kcopy)
#if !defined(NIAGARA_IMPL)
        cmp     %o2, FP_COPY                    ! check for small copy/leaf case
        bgt,pt  %ncc, .kcopy_more               !
        nop
.kcopy_small:                                   ! setup error handler
        sethi   %hi(.sm_copyerr), %o4
        or      %o4, %lo(.sm_copyerr), %o4      ! .sm_copyerr is lofault value
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        ! Note that we carefully do *not* flag the setting of
        ! t_lofault.
        membar  #Sync                           ! sync error barrier
        b       .sm_do_copy                     ! common code
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! set t_lofault


.kcopy_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.copyerr), %l7              ! copyerr is lofault value
        or      %l7, %lo(.copyerr), %l7
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        ! Note that we carefully do *not* flag the setting of
        ! t_lofault.
        membar  #Sync                           ! sync error barrier
        b       .do_copy                        ! common code
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault

/*
 * We got here because of a fault during a small kcopy, or during a
 * small bcopy if a fault handler existed when bcopy was called.
 * No floating point registers are used by the small copies,
 * which run as leaf routines.
 * Errno value is in %g1.
 */
.sm_copyerr:
        ! The kcopy will always set a t_lofault handler. If it fires,
        ! we're expected to just return the error code and not to
        ! invoke any existing error handler. As far as bcopy is concerned,
        ! we only set t_lofault if there was an existing lofault handler.
        ! In that case we're expected to invoke the previously existing
        ! handler after resetting the t_lofault value.
        btst    LOFAULT_SET, %o5
        membar  #Sync                           ! sync error barrier
        andn    %o5, LOFAULT_SET, %o5           ! clear fault flag
        bnz,pn  %ncc, 3f
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
        mov     %g1, %o0
3:
        ! We're here via bcopy. There must have been an error handler
        ! in place otherwise we would have died a nasty death already.
        jmp     %o5                             ! goto real handler
        mov     %g0, %o0
/*
 *  end of .sm_copyerr
 */

/*
 * We got here because of a fault during kcopy, or during bcopy if a
 * fault handler existed when bcopy was called.
 * The stack and fp registers need to be restored.
 * Errno value is in %g1.
 */
.copyerr:
        sethi   %hi(.copyerr2), %l1
        or      %l1, %lo(.copyerr2), %l1
        membar  #Sync                           ! sync error barrier
        stn     %l1, [THREAD_REG + T_LOFAULT]   ! set t_lofault
        btst    FPUSED_FLAG, %o5
        bz,pt   %xcc, 1f
        and     %o5, LOFAULT_SET, %l1   ! copy flag to %l1

        membar  #Sync                           ! sync error barrier
        wr      %l5, 0, %gsr
        btst    FPRS_FEF, %g5
        bz,pt   %icc, 4f
        nop
        ! restore fpregs from stack
        BLD_FP_FROMSTACK(%o2)
        ba,pt   %ncc, 2f
        wr      %g5, 0, %fprs           ! restore fprs
4:
        FZERO
        wr      %g5, 0, %fprs           ! restore fprs
2:
        ldn     [THREAD_REG + T_LWP], %o2
        brnz,pt %o2, 1f
        nop

        ldsb    [THREAD_REG + T_PREEMPT], %l0
        deccc   %l0
        bnz,pn  %ncc, 1f
        stb     %l0, [THREAD_REG + T_PREEMPT]

        ! Check for a kernel preemption request
        ldn     [THREAD_REG + T_CPU], %l0
        ldub    [%l0 + CPU_KPRUNRUN], %l0
        brnz,a,pt       %l0, 1f ! Need to call kpreempt?
        or      %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag

        ! The kcopy will always set a t_lofault handler. If it fires,
        ! we're expected to just return the error code and not to
        ! invoke any existing error handler. As far as bcopy is concerned,
        ! we only set t_lofault if there was an existing lofault handler.
        ! In that case we're expected to invoke the previously existing
        ! handler after resetting the t_lofault value.
1:
        andn    %o5, COPY_FLAGS, %o5    ! remove flags from lofault address
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault

        ! call kpreempt if necessary
        btst    KPREEMPT_FLAG, %l1
        bz,pt   %icc, 2f
        nop
        call    kpreempt
        rdpr    %pil, %o0       ! pass %pil
2:
        btst    LOFAULT_SET, %l1
        bnz,pn  %ncc, 3f
        nop
        ret
        restore %g1, 0, %o0
3:
        ! We're here via bcopy. There must have been an error handler
        ! in place otherwise we would have died a nasty death already.
        jmp     %o5                             ! goto real handler
        restore %g0, 0, %o0                     ! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
        .asciz  "Unable to restore fp state after copy operation"

        .align  4
.copyerr2:
        set     fp_panic_msg, %o0
        call    panic
        nop
/*
 *  end of .copyerr
 */

#else   /* NIAGARA_IMPL */
        save    %sp, -SA(MINFRAME), %sp
        set     .copyerr, %l7                   ! copyerr is lofault value
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        or      %o5, LOFAULT_SET, %o5
        membar  #Sync                           ! sync error barrier
        b       .do_copy                        ! common code
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault

/*
 * We got here because of a fault during kcopy.
 * Errno value is in %g1.
 */
.copyerr:
        ! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
        ! into %o5 to indicate it has set t_lofault handler. Need to clear
        ! LOFAULT_SET flag before restoring the error handler.
        andn    %o5, LOFAULT_SET, %o5
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        ret
        restore %g1, 0, %o0
#endif  /* NIAGARA_IMPL */

        SET_SIZE(kcopy)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 */

        ENTRY(bcopy)
#if !defined(NIAGARA_IMPL)
        cmp     %o2, FP_COPY                    ! check for small copy/leaf case
        bgt,pt  %ncc, .bcopy_more               !
        nop
.bcopy_small:                                   ! setup error handler
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        tst     %o5
        bz,pt   %icc, .sm_do_copy
        sethi   %hi(.sm_copyerr), %o4
        or      %o4, %lo(.sm_copyerr), %o4      ! .sm_copyerr is lofault value
        membar  #Sync                           ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! set t_lofault
        or      %o5, LOFAULT_SET, %o5           ! Error should trampoline
.sm_do_copy:
        mov     %o0, %g1                ! save %o0
        cmp     %o2, SHORTCOPY          ! make sure there is enough to align
        ble,pt  %ncc, .bc_smallest
        andcc   %o1, 0x7, %o3           ! is dest long aligned
        bnz,pn  %ncc, .bc_align
        andcc   %o1, 1, %o3             ! is dest byte aligned

! Destination is long word aligned
.bc_al_src:
        andcc   %o0, 7, %o3
        brnz,pt %o3, .bc_src_dst_unal8
        nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than FP_COPY bytes.
 * Also handles the finish-up for large block moves, so there may be
 * fewer than 32 bytes to move.
 */
.bc_medlong:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medl31
        nop
.bc_medl32:
        ldx     [%o0], %o4              ! move 32 bytes
        subcc   %o2, 32, %o2            ! decrement length count by 32
        stx     %o4, [%o1]
        ldx     [%o0+8], %o4
        stx     %o4, [%o1+8]
        ldx     [%o0+16], %o4
        add     %o0, 32, %o0            ! increase src ptr by 32
        stx     %o4, [%o1+16]
        ldx     [%o0-8], %o4
        add     %o1, 32, %o1            ! increase dst ptr by 32
        bgu,pt  %ncc, .bc_medl32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]
.bc_medl31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medl7         ! skip if 7 or fewer bytes left
        nop
.bc_medl8:
        ldx     [%o0], %o4              ! move 8 bytes
        add     %o0, 8, %o0             ! increase src ptr by 8
        subcc   %o2, 8, %o2             ! decrease count by 8
        add     %o1, 8, %o1             ! increase dst ptr by 8
        bgu,pt  %ncc, .bc_medl8
        stx     %o4, [%o1-8]
.bc_medl7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bnz,pt  %ncc, .bc_small4        ! do final bytes if not finished

.bc_smallx:                             ! finish up and exit
        tst     %o5
        bz,pt   %ncc, .bc_sm_done
        andn    %o5, COPY_FLAGS, %o5    ! remove flags from lofault address
        membar  #Sync                   ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
.bc_sm_done:
        retl
        mov     %g0, %o0

.bc_small4:
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        ld      [%o0], %o4              ! move 4 bytes
        add     %o0, 4, %o0             ! increase src ptr by 4
        add     %o1, 4, %o1             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bz,pt   %ncc, .bc_smallx
        stw     %o4, [%o1-4]

.bc_small3x:                            ! Exactly 1, 2, or 3 bytes remain
        subcc   %o2, 1, %o2             ! reduce count for cc test
        ldub    [%o0], %o4              ! load one byte
        bz,pt   %ncc, .bc_smallx
        stb     %o4, [%o1]              ! store one byte
        ldub    [%o0+1], %o4            ! load second byte
        subcc   %o2, 1, %o2
        bz,pt   %ncc, .bc_smallx
        stb     %o4, [%o1+1]            ! store second byte
        ldub    [%o0+2], %o4            ! load third byte
        ba      .bc_smallx
        stb     %o4, [%o1+2]            ! store third byte

.bc_smallest:                           ! 7 or fewer bytes remain
        tst     %o2
        bz,pt   %ncc, .bc_smallx
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x
        nop
        ldub    [%o0], %o4              ! read byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        stb     %o4, [%o1]              ! write byte
        ldub    [%o0+1], %o4            ! repeat for total of 4 bytes
        add     %o0, 4, %o0             ! advance src by 4
        stb     %o4, [%o1+1]
        ldub    [%o0-2], %o4
        add     %o1, 4, %o1             ! advance dst by 4
        stb     %o4, [%o1-2]
        ldub    [%o0-1], %o4
        bnz,pt  %ncc, .bc_small3x
        stb     %o4, [%o1-1]
        ba      .bc_smallx
        nop

/*
 * Align destination to long word boundary
 */
.bc_align:                              ! byte align test in prior branch delay
        bnz,pt  %ncc, .bc_al_d1
.bc_al_d1f:                             ! dest is now half word aligned
        andcc   %o1, 2, %o3
        bnz,pt  %ncc, .bc_al_d2
.bc_al_d2f:                             ! dest is now word aligned
        andcc   %o1, 4, %o3             ! is dest longword aligned?
        bz,pt   %ncc, .bc_al_src
        nop
.bc_al_d4:                              ! dest is word aligned;  src is unknown
        ldub    [%o0], %o4              ! move a word (src align unknown)
        ldub    [%o0+1], %o3
        sll     %o4, 24, %o4            ! position
        sll     %o3, 16, %o3            ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%o0+2], %o4
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%o0+3], %o4
        or      %o4, %o3, %o4           ! merge
        stw     %o4,[%o1]               ! store four bytes
        add     %o0, 4, %o0             ! adjust src by 4
        add     %o1, 4, %o1             ! adjust dest by 4
        sub     %o2, 4, %o2             ! adjust count by 4
        andcc   %o0, 7, %o3             ! check for src long word alignment
        brz,pt  %o3, .bc_medlong
.bc_src_dst_unal8:
        ! dst is 8-byte aligned, src is not
        ! Size is less than FP_COPY
        ! Following code is to select for alignment
        andcc   %o0, 0x3, %o3           ! test word alignment
        bz,pt   %ncc, .bc_medword
        nop
        andcc   %o0, 0x1, %o3           ! test halfword alignment
        bnz,pt  %ncc, .bc_med_byte      ! go to byte move if not halfword
        andcc   %o0, 0x2, %o3           ! test which byte alignment
        ba      .bc_medhalf
        nop
.bc_al_d1:                              ! align dest to half word
        ldub    [%o0], %o4              ! move a byte
        add     %o0, 1, %o0
        stb     %o4, [%o1]
        add     %o1, 1, %o1
        andcc   %o1, 2, %o3
        bz,pt   %ncc, .bc_al_d2f
        sub     %o2, 1, %o2
.bc_al_d2:                              ! align dest to word
        ldub    [%o0], %o4              ! move a half-word (src align unknown)
        ldub    [%o0+1], %o3
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o4           ! merge
        sth     %o4, [%o1]
        add     %o0, 2, %o0
        add     %o1, 2, %o1
        andcc   %o1, 4, %o3             ! is dest longword aligned?
        bz,pt   %ncc, .bc_al_src
        sub     %o2, 2, %o2
        ba      .bc_al_d4
        nop
/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination is in cache for medium
 * to short data moves.
 */
.bc_medword:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medw31
        nop
.bc_medw32:
        ld      [%o0], %o4              ! move a block of 32 bytes
        stw     %o4, [%o1]
        ld      [%o0+4], %o4
        stw     %o4, [%o1+4]
        ld      [%o0+8], %o4
        stw     %o4, [%o1+8]
        ld      [%o0+12], %o4
        stw     %o4, [%o1+12]
        ld      [%o0+16], %o4
        stw     %o4, [%o1+16]
        ld      [%o0+20], %o4
        subcc   %o2, 32, %o2            ! decrement length count
        stw     %o4, [%o1+20]
        ld      [%o0+24], %o4
        add     %o0, 32, %o0            ! increase src ptr by 32
        stw     %o4, [%o1+24]
        ld      [%o0-4], %o4
        add     %o1, 32, %o1            ! increase dst ptr by 32
        bgu,pt  %ncc, .bc_medw32        ! repeat if at least 32 bytes left
        stw     %o4, [%o1-4]
.bc_medw31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medw7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medw15:
        ld      [%o0], %o4              ! move a block of 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        stw     %o4, [%o1]
        add     %o0, 8, %o0             ! increase src ptr by 8
        ld      [%o0-4], %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
        bgu,pt  %ncc, .bc_medw15
        stw     %o4, [%o1-4]
.bc_medw7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .bc_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        ld      [%o0], %o4              ! move 4 bytes
        add     %o0, 4, %o0             ! increase src ptr by 4
        add     %o1, 4, %o1             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bnz     .bc_small3x
        stw     %o4, [%o1-4]
        ba      .bc_smallx
        nop

.bc_medhalf:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medh31
        nop
.bc_medh32:                             ! load and store block of 32 bytes
        subcc   %o2, 32, %o2            ! decrement length count

        lduh    [%o0], %o4              ! move 32 bytes
        lduw    [%o0+2], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+6], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        lduh    [%o0+8], %o4
        lduw    [%o0+10], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+14], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        lduh    [%o0+16], %o4
        lduw    [%o0+18], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+22], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        lduh    [%o0-8], %o4
        lduw    [%o0-6], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0-2], %o4
        or      %o3, %o4, %o4
        bgu,pt  %ncc, .bc_medh32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.bc_medh31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medh7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medh15:
        lduh    [%o0], %o4              ! move 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        lduw    [%o0+2], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        add     %o1, 8, %o1             ! increase dst ptr by 8
        lduh    [%o0+6], %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medh15
        stx     %o4, [%o1-8]
.bc_medh7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .bc_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        lduh    [%o0], %o4
        sll     %o4, 16, %o4
        lduh    [%o0+2], %o3
        or      %o3, %o4, %o4
        subcc   %o2, 4, %o2
        add     %o0, 4, %o0
        add     %o1, 4, %o1
        bnz     .bc_small3x
        stw     %o4, [%o1-4]
        ba      .bc_smallx
        nop

        .align 16
.bc_med_byte:
        bnz,pt  %ncc, .bc_medbh32a      ! go to correct byte move
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medb31
        nop
.bc_medb32:                             ! Alignment 1 or 5
        subcc   %o2, 32, %o2            ! decrement length count

        ldub    [%o0], %o4              ! load and store a block of 32 bytes
        sllx    %o4, 56, %o3
        lduh    [%o0+1], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+3], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        ldub    [%o0+8], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0+9], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+11], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+15], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        ldub    [%o0+16], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0+17], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+19], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+23], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        ldub    [%o0-8], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0-7], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0-5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0-1], %o4
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medb32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.bc_medb31:                             ! 31 or fewer bytes remaining
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medb7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medb15:

        ldub    [%o0], %o4              ! load and store a block of 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        sllx    %o4, 56, %o3
        lduh    [%o0+1], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+3], %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medb15
        stx     %o4, [%o1-8]
.bc_medb7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .bc_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        ldub    [%o0], %o4              ! move 4 bytes
        sll     %o4, 24, %o3
        lduh    [%o0+1], %o4
        sll     %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+3], %o4
        or      %o4, %o3, %o4
        subcc   %o2, 4, %o2
        add     %o0, 4, %o0
        add     %o1, 4, %o1
        bnz     .bc_small3x
        stw     %o4, [%o1-4]
        ba      .bc_smallx
        nop

        .align 16
.bc_medbh32a:                           ! Alignment 3 or 7
        ble,pt  %ncc, .bc_medbh31
        nop
.bc_medbh32:                            ! Alignment 3 or 7
        subcc   %o2, 32, %o2            ! decrement length count

        ldub    [%o0], %o4              ! load and store a block of 32 bytes
        sllx    %o4, 56, %o3
        lduw    [%o0+1], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        ldub    [%o0+8], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0+9], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+13], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+15], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        ldub    [%o0+16], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0+17], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+21], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+23], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        ldub    [%o0-8], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0-7], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0-3], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0-1], %o4
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medbh32       ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.bc_medbh31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medb7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medbh15:
        ldub    [%o0], %o4              ! load and store a block of 8 bytes
        sllx    %o4, 56, %o3
        lduw    [%o0+1], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]
        subcc   %o2, 8, %o2             ! decrement length count
        add     %o1, 8, %o1             ! increase dst ptr by 8
        add     %o0, 8, %o0             ! increase src ptr by 8
        bgu,pt  %ncc, .bc_medbh15
        stx     %o4, [%o1-8]
        ba      .bc_medb7
        nop

        SET_SIZE(bcopy)
/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 */
        ENTRY(bcopy_more)
.bcopy_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        brz,pt  %o5, .do_copy
        nop
        sethi   %hi(.copyerr), %l7              ! copyerr is lofault value
        or      %l7, %lo(.copyerr), %l7
        membar  #Sync                           ! sync error barrier
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault
        ! We branched straight to .do_copy above when t_lofault was
        ! zero on entry, so the saved handler in %o5 is non-null here.
        ! Mark ourselves as coming from bcopy, since kcopy and bcopy
        ! share the code path from here: LOFAULT_SET tells the error
        ! path to trampoline to the saved handler instead of returning.
        or      %o5, LOFAULT_SET, %o5
.do_copy:
        ldn     [THREAD_REG + T_LWP], %o3
        brnz,pt %o3, 1f
        nop
/*
 * kpreempt_disable();
 */
        ldsb    [THREAD_REG +T_PREEMPT], %o3
        inc     %o3
        stb     %o3, [THREAD_REG + T_PREEMPT]
1:
/*
 * The following code is for large copies. We know there are at
 * least FP_COPY bytes available. FP regs are used, so
 * we save registers and fp regs before starting.
 */
        rd      %fprs, %g5              ! check for unused fp
        or      %o5,FPUSED_FLAG,%o5
        ! if fprs.fef == 0, set it.
        ! Setting it when already set costs more than checking
        andcc   %g5, FPRS_FEF, %g5      ! test FEF, fprs.du = fprs.dl = 0
        bz,pt   %ncc, .bc_fp_unused
        prefetch [%i0 + (1 * CACHE_LINE)], #one_read
        BST_FP_TOSTACK(%o3)
        ba      .bc_fp_ready
.bc_fp_unused:
        andcc   %i1, 1, %o3             ! is dest byte aligned
        wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
.bc_fp_ready:
        rd      %gsr, %l5               ! save %gsr value
        bnz,pt  %ncc, .bc_big_d1
.bc_big_d1f:                            ! dest is now half word aligned
        andcc   %i1, 2, %o3
        bnz,pt  %ncc, .bc_big_d2
.bc_big_d2f:                            ! dest is now word aligned
        andcc   %i1, 4, %o3
        bnz,pt  %ncc, .bc_big_d4
.bc_big_d4f:                            ! dest is now long word aligned
        andcc   %i0, 7, %o3             ! is src long word aligned
        brnz,pt %o3, .bc_big_unal8
        prefetch [%i0 + (2 * CACHE_LINE)], #one_read
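
        ! Prefetch distance ramps up: line 1 was requested at FP setup,
        ! line 2 just above, and the block loops below stay 3-4 cache
        ! lines ahead of the loads.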

        ! Src and dst are long word aligned
        ! align dst to 64 byte boundary
        andcc   %i1, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
        brz,pn  %o3, .bc_al_to_64
        nop
        sub     %o3, 64, %o3            ! %o3 has negative bytes to move
        add     %i2, %o3, %i2           ! adjust remaining count
        andcc   %o3, 8, %o4             ! odd long words to move?
        brz,pt  %o4, .bc_al_to_16
        nop
        add     %o3, 8, %o3
        ldx     [%i0], %o4
        add     %i0, 8, %i0             ! increment src ptr
        add     %i1, 8, %i1             ! increment dst ptr
        stx     %o4, [%i1-8]
! Dest is aligned on 16 bytes, src 8 byte aligned
.bc_al_to_16:
        andcc   %o3, 0x30, %o4          ! pair of long words to move?
        brz,pt  %o4, .bc_al_to_64
        nop
.bc_al_mv_16:
        add     %o3, 16, %o3
        ldx     [%i0], %o4
        stx     %o4, [%i1]
        ldx     [%i0+8], %o4
        add     %i0, 16, %i0            ! increment src ptr
        stx     %o4, [%i1+8]
        andcc   %o3, 48, %o4
        brnz,pt %o4, .bc_al_mv_16
        add     %i1, 16, %i1            ! increment dst ptr
! Dest is aligned on 64 bytes, src 8 byte aligned
.bc_al_to_64:
        ! Determine source alignment
        ! to correct 8 byte offset
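        ! The .bc_aln_abc labels encode src address bits: a = bit 5 (32),
        ! b = bit 4 (16), c = bit 3 (8).  For example, .bc_aln_111 means
        ! src % 64 == 56, i.e. the source is 8 bytes short of a 64-byte
        ! boundary.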
        andcc   %i0, 32, %o3
        brnz,pn %o3, .bc_aln_1
        andcc   %i0, 16, %o3
        brnz,pn %o3, .bc_aln_01
        andcc   %i0, 8, %o3
        brz,pn  %o3, .bc_aln_000
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .bc_aln_001
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read

.bc_aln_01:
        brnz,pn %o3, .bc_aln_011
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .bc_aln_010
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_1:
        andcc   %i0, 16, %o3
        brnz,pn %o3, .bc_aln_11
        andcc   %i0, 8, %o3
        brnz,pn %o3, .bc_aln_101
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .bc_aln_100
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_11:
        brz,pn  %o3, .bc_aln_110
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read

.bc_aln_111:
! Alignment off by 8 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        add     %i0, 8, %i0
        sub     %i2, 8, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
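        ! Software-pipelined: %d0 holds the doubleword fetched ahead of
        ! the 64-byte boundary; each pass block-loads 64 bytes into
        ! %d16-%d30, shifts them into the %d0-%d14 store window, and
        ! rotates the tail into %d0 for the next pass.  %i1 now holds
        ! (dst - src), so advancing %i0 advances both streams.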
.bc_aln_111_loop:
        ldda    [%i0]ASI_BLK_P,%d16             ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d2
        fmovd   %d18, %d4
        fmovd   %d20, %d6
        fmovd   %d22, %d8
        fmovd   %d24, %d10
        fmovd   %d26, %d12
        fmovd   %d28, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d30, %d0
        bgt,pt  %ncc, .bc_aln_111_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        ba      .bc_remain_stuff
        add     %i1, 8, %i1
        ! END OF aln_111

.bc_aln_110:
! Alignment off by 16 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        add     %i0, 16, %i0
        sub     %i2, 16, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_110_loop:
        ldda    [%i0]ASI_BLK_P,%d16             ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d4
        fmovd   %d18, %d6
        fmovd   %d20, %d8
        fmovd   %d22, %d10
        fmovd   %d24, %d12
        fmovd   %d26, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d28, %d0
        fmovd   %d30, %d2
        bgt,pt  %ncc, .bc_aln_110_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        ba      .bc_remain_stuff
        add     %i1, 16, %i1
        ! END OF aln_110

.bc_aln_101:
! Alignment off by 24 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        add     %i0, 24, %i0
        sub     %i2, 24, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_101_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d6
        fmovd   %d18, %d8
        fmovd   %d20, %d10
        fmovd   %d22, %d12
        fmovd   %d24, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d26, %d0
        fmovd   %d28, %d2
        fmovd   %d30, %d4
        bgt,pt  %ncc, .bc_aln_101_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        ba      .bc_remain_stuff
        add     %i1, 24, %i1
        ! END OF aln_101

.bc_aln_100:
! Alignment off by 32 bytes
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16],%d4
        ldd     [%i0+24],%d6
        add     %i0, 32, %i0
        sub     %i2, 32, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_100_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d8
        fmovd   %d18, %d10
        fmovd   %d20, %d12
        fmovd   %d22, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d24, %d0
        fmovd   %d26, %d2
        fmovd   %d28, %d4
        fmovd   %d30, %d6
        bgt,pt  %ncc, .bc_aln_100_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        ba      .bc_remain_stuff
        add     %i1, 32, %i1
        ! END OF aln_100

.bc_aln_011:
! Alignment off by 40 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        ldd     [%i0+24], %d6
        ldd     [%i0+32], %d8
        add     %i0, 40, %i0
        sub     %i2, 40, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_011_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d10
        fmovd   %d18, %d12
        fmovd   %d20, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d22, %d0
        fmovd   %d24, %d2
        fmovd   %d26, %d4
        fmovd   %d28, %d6
        fmovd   %d30, %d8
        bgt,pt  %ncc, .bc_aln_011_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        std     %d8, [%i1+32]
        ba      .bc_remain_stuff
        add     %i1, 40, %i1
        ! END OF aln_011

.bc_aln_010:
! Alignment off by 48 bytes
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        ldd     [%i0+24], %d6
        ldd     [%i0+32], %d8
        ldd     [%i0+40], %d10
        add     %i0, 48, %i0
        sub     %i2, 48, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_010_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d12
        fmovd   %d18, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d20, %d0
        fmovd   %d22, %d2
        fmovd   %d24, %d4
        fmovd   %d26, %d6
        fmovd   %d28, %d8
        fmovd   %d30, %d10
        bgt,pt  %ncc, .bc_aln_010_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        std     %d8, [%i1+32]
        std     %d10, [%i1+40]
        ba      .bc_remain_stuff
        add     %i1, 48, %i1
        ! END OF aln_010

.bc_aln_001:
! Alignment off by 56 bytes
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        ldd     [%i0+24], %d6
        ldd     [%i0+32], %d8
        ldd     [%i0+40], %d10
        ldd     [%i0+48], %d12
        add     %i0, 56, %i0
        sub     %i2, 56, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_001_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d18, %d0
        fmovd   %d20, %d2
        fmovd   %d22, %d4
        fmovd   %d24, %d6
        fmovd   %d26, %d8
        fmovd   %d28, %d10
        fmovd   %d30, %d12
        bgt,pt  %ncc, .bc_aln_001_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        std     %d8, [%i1+32]
        std     %d10, [%i1+40]
        std     %d12, [%i1+48]
        ba      .bc_remain_stuff
        add     %i1, 56, %i1
        ! END OF aln_001

.bc_aln_000:
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_000_loop:
        ldda    [%i0]ASI_BLK_P,%d0
        subcc   %o3, 64, %o3
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        bgt,pt  %ncc, .bc_aln_000_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        ! END OF aln_000

.bc_remain_stuff:
        subcc   %i2, 31, %i2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_aln_31
        nop
.bc_aln_32:
        ldx     [%i0], %o4              ! move 32 bytes
        subcc   %i2, 32, %i2            ! decrement length count by 32
        stx     %o4, [%i1]
        ldx     [%i0+8], %o4
        stx     %o4, [%i1+8]
        ldx     [%i0+16], %o4
        add     %i0, 32, %i0            ! increase src ptr by 32
        stx     %o4, [%i1+16]
        ldx     [%i0-8], %o4
        add     %i1, 32, %i1            ! increase dst ptr by 32
        bgu,pt  %ncc, .bc_aln_32        ! repeat if at least 32 bytes left
        stx     %o4, [%i1-8]
.bc_aln_31:
        addcc   %i2, 24, %i2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_aln_7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_aln_15:
        ldx     [%i0], %o4              ! move 8 bytes
        add     %i0, 8, %i0             ! increase src ptr by 8
        subcc   %i2, 8, %i2             ! decrease count by 8
        add     %i1, 8, %i1             ! increase dst ptr by 8
        bgu,pt  %ncc, .bc_aln_15
        stx     %o4, [%i1-8]            !
.bc_aln_7:
        addcc   %i2, 7, %i2             ! finish adjustment of remaining count
        bz,pt   %ncc, .bc_exit          ! exit if finished
        cmp     %i2, 4
        blt,pt  %ncc, .bc_unaln3x       ! skip if less than 4 bytes left
        nop                             !
        ld      [%i0], %o4              ! move 4 bytes
        add     %i0, 4, %i0             ! increase src ptr by 4
        add     %i1, 4, %i1             ! increase dst ptr by 4
        subcc   %i2, 4, %i2             ! decrease count by 4
        bnz     .bc_unaln3x
        stw     %o4, [%i1-4]
        ba      .bc_exit
        nop

        ! destination alignment code
.bc_big_d1:
        ldub    [%i0], %o4              ! move a byte
        add     %i0, 1, %i0
        stb     %o4, [%i1]
        add     %i1, 1, %i1
        andcc   %i1, 2, %o3
        bz,pt   %ncc, .bc_big_d2f
        sub     %i2, 1, %i2
.bc_big_d2:
        ldub    [%i0], %o4              ! move a half-word (src align unknown)
        ldub    [%i0+1], %o3
        add     %i0, 2, %i0
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o4           ! merge
        sth     %o4, [%i1]
        add     %i1, 2, %i1
        andcc   %i1, 4, %o3
        bz,pt   %ncc, .bc_big_d4f
        sub     %i2, 2, %i2
.bc_big_d4:
        ldub    [%i0], %o4              ! move a word (src align unknown)
        ldub    [%i0+1], %o3
        sll     %o4, 24, %o4            ! position
        sll     %o3, 16, %o3            ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%i0+2], %o4
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%i0+3], %o4
        or      %o4, %o3, %o4           ! merge
        stw     %o4,[%i1]               ! store four bytes
        add     %i0, 4, %i0             ! adjust src by 4
        add     %i1, 4, %i1             ! adjust dest by 4
        ba      .bc_big_d4f
        sub     %i2, 4, %i2             ! adjust count by 4


        ! Dst is on 8 byte boundary; src is not;
.bc_big_unal8:
        andcc   %i1, 0x3f, %o3          ! is dst 64-byte block aligned?
        bz      %ncc, .bc_unalnsrc
        sub     %o3, 64, %o3            ! %o3 will be multiple of 8
        neg     %o3                     ! bytes until dest is 64 byte aligned
        sub     %i2, %o3, %i2           ! update cnt with bytes to be moved
        ! Move bytes according to source alignment
        andcc   %i0, 0x1, %o4
        bnz     %ncc, .bc_unalnbyte     ! check for byte alignment
        nop
        andcc   %i0, 2, %o4             ! check for half word alignment
        bnz     %ncc, .bc_unalnhalf
        nop
        ! Src is word aligned, move bytes until dest 64 byte aligned
.bc_unalnword:
        ld      [%i0], %o4              ! load 4 bytes
        stw     %o4, [%i1]              ! and store 4 bytes
        ld      [%i0+4], %o4            ! load 4 bytes
        add     %i0, 8, %i0             ! increase src ptr by 8
        stw     %o4, [%i1+4]            ! and store 4 bytes
        subcc   %o3, 8, %o3             ! decrease count by 8
        bnz     %ncc, .bc_unalnword
        add     %i1, 8, %i1             ! increase dst ptr by 8
        ba      .bc_unalnsrc
        nop

        ! Src is half-word aligned, move bytes until dest 64 byte aligned
.bc_unalnhalf:
        lduh    [%i0], %o4              ! load 2 bytes
        sllx    %o4, 32, %i3            ! shift left
        lduw    [%i0+2], %o4
        or      %o4, %i3, %i3
        sllx    %i3, 16, %i3
        lduh    [%i0+6], %o4
        or      %o4, %i3, %i3
        stx     %i3, [%i1]
        add     %i0, 8, %i0
        subcc   %o3, 8, %o3
        bnz     %ncc, .bc_unalnhalf
        add     %i1, 8, %i1
        ba      .bc_unalnsrc
        nop

        ! Src is byte aligned, move bytes until dest 64 byte aligned
.bc_unalnbyte:
        sub     %i1, %i0, %i1           ! share pointer advance
.bc_unalnbyte_loop:
        ldub    [%i0], %o4
        sllx    %o4, 56, %i3
        lduh    [%i0+1], %o4
        sllx    %o4, 40, %o4
        or      %o4, %i3, %i3
        lduh    [%i0+3], %o4
        sllx    %o4, 24, %o4
        or      %o4, %i3, %i3
        lduh    [%i0+5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %i3, %i3
        ldub    [%i0+7], %o4
        or      %o4, %i3, %i3
        stx     %i3, [%i1+%i0]
        subcc   %o3, 8, %o3
        bnz     %ncc, .bc_unalnbyte_loop
        add     %i0, 8, %i0
        add     %i1,%i0, %i1            ! restore pointer

        ! Destination is now block (64 byte) aligned; src is not 8 byte aligned
.bc_unalnsrc:
        andn    %i2, 0x3f, %i3          ! %i3 is multiple of block size
        and     %i2, 0x3f, %i2          ! residue bytes in %i2
        add     %i2, 64, %i2            ! Ensure we don't load beyond
        sub     %i3, 64, %i3            ! end of source buffer

        andn    %i0, 0x3f, %o4          ! %o4 has block aligned src address
        prefetch [%o4 + (3 * CACHE_LINE)], #one_read
        alignaddr %i0, %g0, %g0         ! generate %gsr
        add     %i0, %i3, %i0           ! advance %i0 to after blocks
        !
        ! Determine source alignment to correct 8 byte offset
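        ! Descriptive note: the tested bits (0x20, 0x10, 0x08) form
        ! n = (src >> 3) & 7, the doubleword offset of the source within
        ! its 64-byte block (the block-multiple advance above leaves the
        ! low bits of %i0 unchanged).  Each .bc_unaln_<n> variant below
        ! preloads the remaining 8 - n doublewords of the first source
        ! block before entering its faligndata pipeline.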
        andcc   %i0, 0x20, %o3
        brnz,pn %o3, .bc_unaln_1
        andcc   %i0, 0x10, %o3
        brnz,pn %o3, .bc_unaln_01
        andcc   %i0, 0x08, %o3
        brz,a   %o3, .bc_unaln_000
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .bc_unaln_001
        nop
.bc_unaln_01:
        brnz,a  %o3, .bc_unaln_011
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .bc_unaln_010
        nop
.bc_unaln_1:
        brnz,pn %o3, .bc_unaln_11
        andcc   %i0, 0x08, %o3
        brnz,a  %o3, .bc_unaln_101
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .bc_unaln_100
        nop
.bc_unaln_11:
        brz,pn  %o3, .bc_unaln_110
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read

.bc_unaln_111:
        ldd     [%o4+56], %d14
.bc_unaln_111_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d14, %d16, %d48
        faligndata %d16, %d18, %d50
        faligndata %d18, %d20, %d52
        faligndata %d20, %d22, %d54
        faligndata %d22, %d24, %d56
        faligndata %d24, %d26, %d58
        faligndata %d26, %d28, %d60
        faligndata %d28, %d30, %d62
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .bc_unaln_111_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .bc_unaln_done
        nop

.bc_unaln_110:
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.bc_unaln_110_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d12, %d14, %d48
        faligndata %d14, %d16, %d50
        faligndata %d16, %d18, %d52
        faligndata %d18, %d20, %d54
        faligndata %d20, %d22, %d56
        faligndata %d22, %d24, %d58
        faligndata %d24, %d26, %d60
        faligndata %d26, %d28, %d62
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .bc_unaln_110_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .bc_unaln_done
        nop

.bc_unaln_101:
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.bc_unaln_101_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d10, %d12, %d48
        faligndata %d12, %d14, %d50
        faligndata %d14, %d16, %d52
        faligndata %d16, %d18, %d54
        faligndata %d18, %d20, %d56
        faligndata %d20, %d22, %d58
        faligndata %d22, %d24, %d60
        faligndata %d24, %d26, %d62
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .bc_unaln_101_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .bc_unaln_done
        nop

.bc_unaln_100:
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.bc_unaln_100_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d8, %d10, %d48
        faligndata %d10, %d12, %d50
        faligndata %d12, %d14, %d52
        faligndata %d14, %d16, %d54
        faligndata %d16, %d18, %d56
        faligndata %d18, %d20, %d58
        faligndata %d20, %d22, %d60
        faligndata %d22, %d24, %d62
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .bc_unaln_100_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .bc_unaln_done
        nop

.bc_unaln_011:
        ldd     [%o4+24], %d6
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.bc_unaln_011_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d6, %d8, %d48
        faligndata %d8, %d10, %d50
        faligndata %d10, %d12, %d52
        faligndata %d12, %d14, %d54
        faligndata %d14, %d16, %d56
        faligndata %d16, %d18, %d58
        faligndata %d18, %d20, %d60
        faligndata %d20, %d22, %d62
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .bc_unaln_011_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .bc_unaln_done
        nop

.bc_unaln_010:
        ldd     [%o4+16], %d4
        ldd     [%o4+24], %d6
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.bc_unaln_010_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d4, %d6, %d48
        faligndata %d6, %d8, %d50
        faligndata %d8, %d10, %d52
        faligndata %d10, %d12, %d54
        faligndata %d12, %d14, %d56
        faligndata %d14, %d16, %d58
        faligndata %d16, %d18, %d60
        faligndata %d18, %d20, %d62
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .bc_unaln_010_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .bc_unaln_done
        nop

.bc_unaln_001:
        ldd     [%o4+8], %d2
        ldd     [%o4+16], %d4
        ldd     [%o4+24], %d6
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.bc_unaln_001_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d2, %d4, %d48
        faligndata %d4, %d6, %d50
        faligndata %d6, %d8, %d52
        faligndata %d8, %d10, %d54
        faligndata %d10, %d12, %d56
        faligndata %d12, %d14, %d58
        faligndata %d14, %d16, %d60
        faligndata %d16, %d18, %d62
        fmovd   %d18, %d2
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .bc_unaln_001_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .bc_unaln_done
        nop

.bc_unaln_000:
        ldda    [%o4]ASI_BLK_P, %d0
.bc_unaln_000_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d0, %d2, %d48
        faligndata %d2, %d4, %d50
        faligndata %d4, %d6, %d52
        faligndata %d6, %d8, %d54
        faligndata %d8, %d10, %d56
        faligndata %d10, %d12, %d58
        faligndata %d12, %d14, %d60
        faligndata %d14, %d16, %d62
        fmovd   %d16, %d0
        fmovd   %d18, %d2
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .bc_unaln_000_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read

.bc_unaln_done:
        ! Handle trailing bytes, 64 to 127
        ! Dest long word aligned, Src not long word aligned
        cmp     %i2, 15
        bleu    %ncc, .bc_unaln_short

        andn    %i2, 0x7, %i3           ! %i3 is multiple of 8
        and     %i2, 0x7, %i2           ! residue bytes in %i2
        add     %i2, 8, %i2
        sub     %i3, 8, %i3             ! ensure we don't load past end of src
        andn    %i0, 0x7, %o4           ! %o4 has long word aligned src address
        add     %i0, %i3, %i0           ! advance %i0 to after multiple of 8
        ldd     [%o4], %d0              ! fetch partial word
.bc_unaln_by8:
        ldd     [%o4+8], %d2
        add     %o4, 8, %o4
        faligndata %d0, %d2, %d16
        subcc   %i3, 8, %i3
        std     %d16, [%i1]
        fmovd   %d2, %d0
        bgu,pt  %ncc, .bc_unaln_by8
        add     %i1, 8, %i1

.bc_unaln_short:
        cmp     %i2, 8
        blt,pt  %ncc, .bc_unalnfin
        nop
        ldub    [%i0], %o4
        sll     %o4, 24, %o3
        ldub    [%i0+1], %o4
        sll     %o4, 16, %o4
        or      %o4, %o3, %o3
        ldub    [%i0+2], %o4
        sll     %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%i0+3], %o4
        or      %o4, %o3, %o3
        stw     %o3, [%i1]
        ldub    [%i0+4], %o4
        sll     %o4, 24, %o3
        ldub    [%i0+5], %o4
        sll     %o4, 16, %o4
        or      %o4, %o3, %o3
        ldub    [%i0+6], %o4
        sll     %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%i0+7], %o4
        or      %o4, %o3, %o3
        stw     %o3, [%i1+4]
        add     %i0, 8, %i0
        add     %i1, 8, %i1
        sub     %i2, 8, %i2
.bc_unalnfin:
        cmp     %i2, 4
        blt,pt  %ncc, .bc_unalnz
        tst     %i2
        ldub    [%i0], %o3              ! read byte
        subcc   %i2, 4, %i2             ! reduce count by 4
        sll     %o3, 24, %o3            ! position
        ldub    [%i0+1], %o4
        sll     %o4, 16, %o4            ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%i0+2], %o4
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o3           ! merge
        add     %i1, 4, %i1             ! advance dst by 4
        ldub    [%i0+3], %o4
        add     %i0, 4, %i0             ! advance src by 4
        or      %o4, %o3, %o4           ! merge
        bnz,pt  %ncc, .bc_unaln3x
        stw     %o4, [%i1-4]
        ba      .bc_exit
        nop
.bc_unalnz:
        bz,pt   %ncc, .bc_exit
.bc_unaln3x:                            ! Exactly 1, 2, or 3 bytes remain
        subcc   %i2, 1, %i2             ! reduce count for cc test
        ldub    [%i0], %o4              ! load one byte
        bz,pt   %ncc, .bc_exit
        stb     %o4, [%i1]              ! store one byte
        ldub    [%i0+1], %o4            ! load second byte
        subcc   %i2, 1, %i2
        bz,pt   %ncc, .bc_exit
        stb     %o4, [%i1+1]            ! store second byte
        ldub    [%i0+2], %o4            ! load third byte
        stb     %o4, [%i1+2]            ! store third byte
.bc_exit:
        wr      %l5, %g0, %gsr          ! restore %gsr
        brnz    %g5, .bc_fp_restore
        and     %o5, COPY_FLAGS, %l1    ! save flags in %l1
        FZERO
        wr      %g5, %g0, %fprs
        ba,pt   %ncc, .bc_ex2
        nop
.bc_fp_restore:
        BLD_FP_FROMSTACK(%o4)
.bc_ex2:
        ldn     [THREAD_REG + T_LWP], %o2
        brnz,pt %o2, 1f
        nop

        ldsb    [THREAD_REG + T_PREEMPT], %l0
        deccc   %l0
        bnz,pn  %ncc, 1f
        stb     %l0, [THREAD_REG + T_PREEMPT]

        ! Check for a kernel preemption request
        ldn     [THREAD_REG + T_CPU], %l0
        ldub    [%l0 + CPU_KPRUNRUN], %l0
        brnz,a,pt       %l0, 1f ! Need to call kpreempt?
        or      %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
1:
        btst    LOFAULT_SET, %l1
        bz,pn   %icc, 3f
        andncc  %o5, COPY_FLAGS, %o5
        ! Here via bcopy. Check to see if the handler was NULL.
        ! If so, just return quietly. Otherwise, reset the
        ! handler and return.
        bz,pn %ncc, 2f
        nop
        membar  #Sync
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2:
        btst    KPREEMPT_FLAG, %l1
        bz,pt   %icc, 3f
        nop
        call    kpreempt
        rdpr    %pil, %o0               ! pass %pil
3:
        ret
        restore %g0, 0, %o0

        SET_SIZE(bcopy_more)


#else   /* NIAGARA_IMPL */
        save    %sp, -SA(MINFRAME), %sp
        clr     %o5                     ! flag LOFAULT_SET is not set for bcopy
.do_copy:
        cmp     %i2, 12                 ! for small counts
        blu     %ncc, .bytecp           ! just copy bytes
        .empty

        cmp     %i2, 128                ! for less than 128 bytes
        blu,pn  %ncc, .bcb_punt         ! no block st/quad ld
        nop

        set     use_hw_bcopy, %o2
        ld      [%o2], %o2
        brz,pn  %o2, .bcb_punt
        nop

        subcc   %i1, %i0, %i3
        bneg,a,pn %ncc, 1f
        neg     %i3
1:
        /*
         * Compare against 256 since we should be checking block addresses
         * and (dest & ~63) - (src & ~63) can be 3 blocks even if
         * src = dest + (64 * 3) + 63.
         */
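        /*
         * Worked instance (illustrative, not from the original source):
         * with dest = 0 and src = 255 the raw distance is 255, yet
         * (src & ~63) - (dest & ~63) = 192, only 3 blocks.  Requiring a
         * raw distance of at least 256 guarantees at least 4 blocks
         * between the block-aligned addresses.
         */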
        cmp     %i3, 256
        blu,pn  %ncc, .bcb_punt
        nop

        /*
         * Copies that reach here have at least 2 blocks of data to copy.
         */
.do_blockcopy:
        ! Swap src/dst since the code below is memcpy code
        ! and memcpy/bcopy have different calling sequences
        mov     %i1, %i5
        mov     %i0, %i1
        mov     %i5, %i0

        ! Block (64 bytes) align the destination.
        andcc   %i0, 0x3f, %i3          ! is dst aligned on a 64-byte boundary
        bz      %xcc, .chksrc           ! dst is already block aligned
        sub     %i3, 0x40, %i3
        neg     %i3                     ! bytes till dst is 64-byte aligned
        sub     %i2, %i3, %i2           ! update i2 with new count

        ! Based on source and destination alignment do
        ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
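        ! Rough C sketch of the dispatch below (illustrative only):
        !
        !       if (((dst | src) & 7) == 0)      copy 8 bytes at a time;
        !       else if (((dst | src) & 3) == 0) copy 4 bytes at a time;
        !       else if (((dst | src) & 1) == 0) copy 2 bytes at a time;
        !       else                             copy a byte at a time;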

        ! Is dst & src 8B aligned
        or      %i0, %i1, %o2
        andcc   %o2, 0x7, %g0
        bz      %ncc, .alewdcp
        nop

        ! Is dst & src 4B aligned
        andcc   %o2, 0x3, %g0
        bz      %ncc, .alwdcp
        nop

        ! Is dst & src 2B aligned
        andcc   %o2, 0x1, %g0
        bz      %ncc, .alhlfwdcp
        nop

        ! 1B aligned
1:      ldub    [%i1], %o2
        stb     %o2, [%i0]
        inc     %i1
        deccc   %i3
        bgu,pt  %ncc, 1b
        inc     %i0

        ba      .chksrc
        nop

        ! dst & src 4B aligned
.alwdcp:
        ld      [%i1], %o2
        st      %o2, [%i0]
        add     %i1, 0x4, %i1
        subcc   %i3, 0x4, %i3
        bgu,pt  %ncc, .alwdcp
        add     %i0, 0x4, %i0

        ba      .chksrc
        nop

        ! dst & src 2B aligned
.alhlfwdcp:
        lduh    [%i1], %o2
        stuh    %o2, [%i0]
        add     %i1, 0x2, %i1
        subcc   %i3, 0x2, %i3
        bgu,pt  %ncc, .alhlfwdcp
        add     %i0, 0x2, %i0

        ba      .chksrc
        nop

        ! dst & src 8B aligned
.alewdcp:
        ldx     [%i1], %o2
        stx     %o2, [%i0]
        add     %i1, 0x8, %i1
        subcc   %i3, 0x8, %i3
        bgu,pt  %ncc, .alewdcp
        add     %i0, 0x8, %i0

        ! Now Destination is block (64 bytes) aligned
.chksrc:
        andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
        sub     %i2, %i3, %i2           ! Residue bytes in %i2

        mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
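        ! Descriptive note: with this ASI each quad ldda below returns
        ! 16 bytes into an even/odd register pair (e.g. %l2/%l3), and
        ! each stxa is a block-initializing 8-byte store.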

        andcc   %i1, 0xf, %o2           ! is src quadword aligned
        bz,pn   %xcc, .blkcpy           ! src offset in %o2
        nop
        cmp     %o2, 0x8
        bg      .cpy_upper_double
        nop
        bl      .cpy_lower_double
        nop

        ! Falls through when source offset is equal to 8 i.e.
        ! source is double word aligned.
        ! In this case no shift/merge of data is required
        sub     %i1, %o2, %i1           ! align the src at 16 bytes.
        andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
        prefetch [%l0+0x0], #one_read
        ldda    [%i1+0x0]%asi, %l2
loop0:
        ldda    [%i1+0x10]%asi, %l4
        prefetch [%l0+0x40], #one_read

        stxa    %l3, [%i0+0x0]%asi
        stxa    %l4, [%i0+0x8]%asi

        ldda    [%i1+0x20]%asi, %l2
        stxa    %l5, [%i0+0x10]%asi
        stxa    %l2, [%i0+0x18]%asi

        ldda    [%i1+0x30]%asi, %l4
        stxa    %l3, [%i0+0x20]%asi
        stxa    %l4, [%i0+0x28]%asi

        ldda    [%i1+0x40]%asi, %l2
        stxa    %l5, [%i0+0x30]%asi
        stxa    %l2, [%i0+0x38]%asi

        add     %l0, 0x40, %l0
        add     %i1, 0x40, %i1
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, loop0
        add     %i0, 0x40, %i0
        ba      .blkdone
        add     %i1, %o2, %i1           ! increment the source by src offset
                                        ! the src offset was stored in %o2
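
        /*
         * Sketch of the shift/merge performed by ALIGN_DATA below
         * (illustrative, not the macro's actual expansion): with
         * %o0 = left-shift count and %o1 = 64 - %o0, each aligned
         * output doubleword is built as
         *
         *      out = (prev << lshift) | (next >> rshift);
         *
         * i.e. the tail of the previously loaded data is joined with
         * the head of the newly loaded data.
         */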

.cpy_lower_double:
        sub     %i1, %o2, %i1           ! align the src at 16 bytes.
        sll     %o2, 3, %o0             ! %o0 left shift
        mov     0x40, %o1
        sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
        andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
        prefetch [%l0+0x0], #one_read
        ldda    [%i1+0x0]%asi, %l2      ! partial data in %l2 and %l3 has
                                        ! complete data
loop1:
        ldda    [%i1+0x10]%asi, %l4     ! %l4 has partial data for this read.
        ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)        ! merge %l2, %l3 and %l4
                                                        ! into %l2 and %l3
        prefetch [%l0+0x40], #one_read
        stxa    %l2, [%i0+0x0]%asi
        stxa    %l3, [%i0+0x8]%asi

        ldda    [%i1+0x20]%asi, %l2
        ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)        ! merge %l2 with %l5 and
        stxa    %l4, [%i0+0x10]%asi                     ! %l4 from previous read
        stxa    %l5, [%i0+0x18]%asi                     ! into %l4 and %l5

        ! Repeat the same for next 32 bytes.

        ldda    [%i1+0x30]%asi, %l4
        ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
        stxa    %l2, [%i0+0x20]%asi
        stxa    %l3, [%i0+0x28]%asi

        ldda    [%i1+0x40]%asi, %l2
        ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
        stxa    %l4, [%i0+0x30]%asi
        stxa    %l5, [%i0+0x38]%asi

        add     %l0, 0x40, %l0
        add     %i1, 0x40, %i1
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, loop1
        add     %i0, 0x40, %i0
        ba      .blkdone
        add     %i1, %o2, %i1           ! increment the source by src offset
                                        ! the src offset was stored in %o2

.cpy_upper_double:
        sub     %i1, %o2, %i1           ! align the src at 16 bytes.
        mov     0x8, %o0
        sub     %o2, %o0, %o0
        sll     %o0, 3, %o0             ! %o0 left shift
        mov     0x40, %o1
        sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
        andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
        prefetch [%l0+0x0], #one_read
        ldda    [%i1+0x0]%asi, %l2      ! partial data in %l3 for this read and
                                        ! no data in %l2
loop2:
        ldda    [%i1+0x10]%asi, %l4     ! %l4 has complete data and %l5 has
                                        ! partial
        ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)        ! merge %l3, %l4 and %l5
                                                        ! into %l3 and %l4
        prefetch [%l0+0x40], #one_read
        stxa    %l3, [%i0+0x0]%asi
        stxa    %l4, [%i0+0x8]%asi

        ldda    [%i1+0x20]%asi, %l2
        ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)        ! merge %l2 and %l3 with
        stxa    %l5, [%i0+0x10]%asi                     ! %l5 from previous read
        stxa    %l2, [%i0+0x18]%asi                     ! into %l5 and %l2

        ! Repeat the same for next 32 bytes.

        ldda    [%i1+0x30]%asi, %l4
        ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
        stxa    %l3, [%i0+0x20]%asi
        stxa    %l4, [%i0+0x28]%asi

        ldda    [%i1+0x40]%asi, %l2
        ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
        stxa    %l5, [%i0+0x30]%asi
        stxa    %l2, [%i0+0x38]%asi

        add     %l0, 0x40, %l0
        add     %i1, 0x40, %i1
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, loop2
        add     %i0, 0x40, %i0
        ba      .blkdone
        add     %i1, %o2, %i1           ! increment the source by src offset
                                        ! the src offset was stored in %o2


        ! Both Source and Destination are block aligned.
        ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.blkcpy:
        prefetch [%i1+0x0], #one_read
1:
        ldda    [%i1+0x0]%asi, %l0
        ldda    [%i1+0x10]%asi, %l2
        prefetch [%i1+0x40], #one_read

        stxa    %l0, [%i0+0x0]%asi
        ldda    [%i1+0x20]%asi, %l4
        ldda    [%i1+0x30]%asi, %l6

        stxa    %l1, [%i0+0x8]%asi
        stxa    %l2, [%i0+0x10]%asi
        stxa    %l3, [%i0+0x18]%asi
        stxa    %l4, [%i0+0x20]%asi
        stxa    %l5, [%i0+0x28]%asi
        stxa    %l6, [%i0+0x30]%asi
        stxa    %l7, [%i0+0x38]%asi

        add     %i1, 0x40, %i1
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, 1b
        add     %i0, 0x40, %i0

.blkdone:
        membar  #Sync

        brz,pt  %i2, .blkexit
        nop

        ! Handle trailing bytes
        cmp     %i2, 0x8
        blu,pt  %ncc, .residue
        nop

        ! Can we do some 8B ops
        or      %i1, %i0, %o2
        andcc   %o2, 0x7, %g0
        bnz     %ncc, .last4
        nop

        ! Do 8byte ops as long as possible
.last8:
        ldx     [%i1], %o2
        stx     %o2, [%i0]
        add     %i1, 0x8, %i1
        sub     %i2, 0x8, %i2
        cmp     %i2, 0x8
        bgu,pt  %ncc, .last8
        add     %i0, 0x8, %i0

        brz,pt  %i2, .blkexit
        nop

        ba      .residue
        nop

.last4:
        ! Can we do 4B ops
        andcc   %o2, 0x3, %g0
        bnz     %ncc, .last2
        nop
1:
        ld      [%i1], %o2
        st      %o2, [%i0]
        add     %i1, 0x4, %i1
        sub     %i2, 0x4, %i2
        cmp     %i2, 0x4
        bgu,pt  %ncc, 1b
        add     %i0, 0x4, %i0

        brz,pt  %i2, .blkexit
        nop

        ba      .residue
        nop

.last2:
        ! Can we do 2B ops
        andcc   %o2, 0x1, %g0
        bnz     %ncc, .residue
        nop

1:
        lduh    [%i1], %o2
        stuh    %o2, [%i0]
        add     %i1, 0x2, %i1
        sub     %i2, 0x2, %i2
        cmp     %i2, 0x2
        bgu,pt  %ncc, 1b
        add     %i0, 0x2, %i0

        brz,pt  %i2, .blkexit
        nop

.residue:
        ldub    [%i1], %o2
        stb     %o2, [%i0]
        inc     %i1
        deccc   %i2
        bgu,pt  %ncc, .residue
        inc     %i0

.blkexit:

        membar  #Sync                           ! sync error barrier
        ! Restore t_lofault handler, if came here from kcopy().
        tst     %o5
        bz      %ncc, 1f
        andn    %o5, LOFAULT_SET, %o5
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1:
        ret
        restore %g0, 0, %o0


.bcb_punt:
        !
        ! use aligned transfers where possible
        !
        xor     %i0, %i1, %o4           ! xor from and to address
        btst    7, %o4                  ! if lower three bits zero
        bz      .aldoubcp               ! can align on double boundary
        .empty  ! assembler complains about label

        xor     %i0, %i1, %o4           ! xor from and to address
        btst    3, %o4                  ! if lower two bits zero
        bz      .alwordcp               ! can align on word boundary
        btst    3, %i0                  ! delay slot, from address unaligned?
        !
        ! use aligned reads and writes where possible
        ! this differs from wordcp in that it copes
        ! with odd alignment between source and destination
        ! using word reads and writes with the proper shifts
        ! in between to align transfers to and from memory
        ! i0 - src address, i1 - dest address, i2 - count
        ! i3, i4 - tmps used for generating complete word
        ! i5 - word to write
        ! l0 size in bits of upper part of source word (US)
        ! l1 size in bits of lower part of source word (LS = 32 - US)
        ! l2 size in bits of upper part of destination word (UD)
        ! l3 size in bits of lower part of destination word (LD = 32 - UD)
        ! l4 number of bytes leftover after aligned transfers complete
        ! l5 the number 32
        !
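        ! Minimal C sketch of the merge loop below (illustrative only,
        ! assuming 32-bit big-endian words):
        !
        !       out   = carry | (src_word >> US);
        !       carry = src_word << LS;         (LS = 32 - US)
        !
        ! each output word combines the carried-over bits of the
        ! previous source word with the leading bits of the current one.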
        mov     32, %l5                 ! load an oft-needed constant
        bz      .align_dst_only
        btst    3, %i1                  ! is destination address aligned?
        clr     %i4                     ! clear registers used in either case
        bz      .align_src_only
        clr     %l0
        !
        ! both source and destination addresses are unaligned
        !
1:                                      ! align source
        ldub    [%i0], %i3              ! read a byte from source address
        add     %i0, 1, %i0             ! increment source address
        or      %i4, %i3, %i4           ! or in with previous bytes (if any)
        btst    3, %i0                  ! is source aligned?
        add     %l0, 8, %l0             ! increment size of upper source (US)
        bnz,a   1b
        sll     %i4, 8, %i4             ! make room for next byte

        sub     %l5, %l0, %l1           ! generate shift left count (LS)
        sll     %i4, %l1, %i4           ! prepare to get rest
        ld      [%i0], %i3              ! read a word
        add     %i0, 4, %i0             ! increment source address
        srl     %i3, %l0, %i5           ! upper src bits into lower dst bits
        or      %i4, %i5, %i5           ! merge
        mov     24, %l3                 ! align destination
1:
        srl     %i5, %l3, %i4           ! prepare to write a single byte
        stb     %i4, [%i1]              ! write a byte
        add     %i1, 1, %i1             ! increment destination address
        sub     %i2, 1, %i2             ! decrement count
        btst    3, %i1                  ! is destination aligned?
        bnz,a   1b
        sub     %l3, 8, %l3             ! delay slot, decrement shift count (LD)
        sub     %l5, %l3, %l2           ! generate shift left count (UD)
        sll     %i5, %l2, %i5           ! move leftover into upper bytes
        cmp     %l2, %l0                ! cmp # reqd to fill dst w old src left
        bgu     %ncc, .more_needed      ! need more to fill than we have
        nop

        sll     %i3, %l1, %i3           ! clear upper used byte(s)
        srl     %i3, %l1, %i3
        ! get the odd bytes between alignments
        sub     %l0, %l2, %l0           ! regenerate shift count
        sub     %l5, %l0, %l1           ! generate new shift left count (LS)
        and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
        andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
        srl     %i3, %l0, %i4
        or      %i5, %i4, %i5
        st      %i5, [%i1]              ! write a word
        subcc   %i2, 4, %i2             ! decrement count
        bz      %ncc, .unalign_out
        add     %i1, 4, %i1             ! increment destination address

        b       2f
        sll     %i3, %l1, %i5           ! get leftover into upper bits
.more_needed:
        sll     %i3, %l0, %i3           ! save remaining byte(s)
        srl     %i3, %l0, %i3
        sub     %l2, %l0, %l1           ! regenerate shift count
        sub     %l5, %l1, %l0           ! generate new shift left count
        sll     %i3, %l1, %i4           ! move to fill empty space
        b       3f
        or      %i5, %i4, %i5           ! merge to complete word
        !
        ! the source address is aligned and destination is not
        !
.align_dst_only:
        ld      [%i0], %i4              ! read a word
        add     %i0, 4, %i0             ! increment source address
        mov     24, %l0                 ! initial shift alignment count
1:
        srl     %i4, %l0, %i3           ! prepare to write a single byte
        stb     %i3, [%i1]              ! write a byte
        add     %i1, 1, %i1             ! increment destination address
        sub     %i2, 1, %i2             ! decrement count
        btst    3, %i1                  ! is destination aligned?
        bnz,a   1b
        sub     %l0, 8, %l0             ! delay slot, decrement shift count
.xfer:
        sub     %l5, %l0, %l1           ! generate shift left count
        sll     %i4, %l1, %i5           ! get leftover
3:
        and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
        andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
2:
        ld      [%i0], %i3              ! read a source word
        add     %i0, 4, %i0             ! increment source address
        srl     %i3, %l0, %i4           ! upper src bits into lower dst bits
        or      %i5, %i4, %i5           ! merge with upper dest bits (leftover)
        st      %i5, [%i1]              ! write a destination word
        subcc   %i2, 4, %i2             ! decrement count
        bz      %ncc, .unalign_out      ! check if done
        add     %i1, 4, %i1             ! increment destination address
        b       2b                      ! loop
        sll     %i3, %l1, %i5           ! get leftover
.unalign_out:
        tst     %l4                     ! any bytes leftover?
        bz      %ncc, .cpdone
        .empty                          ! allow next instruction in delay slot
1:
        sub     %l0, 8, %l0             ! decrement shift
        srl     %i3, %l0, %i4           ! upper src byte into lower dst byte
        stb     %i4, [%i1]              ! write a byte
        subcc   %l4, 1, %l4             ! decrement count
        bz      %ncc, .cpdone           ! done?
        add     %i1, 1, %i1             ! increment destination
        tst     %l0                     ! any more previously read bytes
        bnz     %ncc, 1b                ! we have leftover bytes
        mov     %l4, %i2                ! delay slot, mv cnt where dbytecp wants
        b       .dbytecp                ! let dbytecp do the rest
        sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
        !
        ! the destination address is aligned and the source is not
        !
.align_src_only:
        ldub    [%i0], %i3              ! read a byte from source address
        add     %i0, 1, %i0             ! increment source address
        or      %i4, %i3, %i4           ! or in with previous bytes (if any)
        btst    3, %i0                  ! is source aligned?
        add     %l0, 8, %l0             ! increment shift count (US)
        bnz,a   .align_src_only
        sll     %i4, 8, %i4             ! make room for next byte
        b,a     .xfer
        !
        ! if from address unaligned for double-word moves,
        ! move bytes till it is; if count is < 56 it could take
        ! longer to align the thing than to do the transfer
        ! in word size chunks right away
        !
.aldoubcp:
        cmp     %i2, 56                 ! if count < 56, use wordcp, it takes
        blu,a   %ncc, .alwordcp         ! longer to align doubles than words
        mov     3, %o0                  ! mask for word alignment
        call    .alignit                ! copy bytes until aligned
        mov     7, %o0                  ! mask for double alignment
        !
        ! source and destination are now double-word aligned
        ! i3 has aligned count returned by alignit
        !
        and     %i2, 7, %i2             ! unaligned leftover count
        sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
5:
        ldx     [%i0+%i1], %o4          ! read from address
        stx     %o4, [%i1]              ! write at destination address
        subcc   %i3, 8, %i3             ! dec count
        bgu     %ncc, 5b
        add     %i1, 8, %i1             ! delay slot, inc to address
        cmp     %i2, 4                  ! see if we can copy a word
        blu     %ncc, .dbytecp          ! if 3 or less bytes use bytecp
        .empty
        !
        ! for leftover bytes we fall into wordcp, if needed
        !
.wordcp:
        and     %i2, 3, %i2             ! unaligned leftover count
5:
        ld      [%i0+%i1], %o4          ! read from address
        st      %o4, [%i1]              ! write at destination address
        subcc   %i3, 4, %i3             ! dec count
        bgu     %ncc, 5b
        add     %i1, 4, %i1             ! delay slot, inc to address
        b,a     .dbytecp

        ! we come here to align copies on word boundaries
.alwordcp:
        call    .alignit                ! go word-align it
        mov     3, %o0                  ! bits that must be zero to be aligned
        b       .wordcp
        sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst

        !
        ! byte copy, works with any alignment
        !
.bytecp:
        b       .dbytecp
        sub     %i0, %i1, %i0           ! i0 gets difference of src and dst

        !
        ! differenced byte copy, works with any alignment
        ! assumes dest in %i1 and (source - dest) in %i0
        !
1:
        stb     %o4, [%i1]              ! write to address
        inc     %i1                     ! inc to address
.dbytecp:
        deccc   %i2                     ! dec count
        bgeu,a  %ncc, 1b                ! loop till done
        ldub    [%i0+%i1], %o4          ! read from address
.cpdone:

        membar  #Sync                           ! sync error barrier
        ! Restore t_lofault handler, if came here from kcopy().
        tst     %o5
        bz      %ncc, 1f
        andn    %o5, LOFAULT_SET, %o5
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1:
        ret
        restore %g0, 0, %o0             ! return (0)

/*
 * Common code used to align transfers on word and doubleword
 * boundaries.  Aligns source and destination and returns a count
 * of aligned bytes to transfer in %i3
 */
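/*
 * Rough C equivalent (illustrative only; %o0 holds the alignment mask):
 *
 *      while (from & mask) {
 *              *to++ = *from++;
 *              count--;
 *      }
 *      return (count & ~mask);         // aligned byte count, in %i3
 */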
1:
        inc     %i0                     ! inc from
        stb     %o4, [%i1]              ! write a byte
        inc     %i1                     ! inc to
        dec     %i2                     ! dec count
.alignit:
        btst    %o0, %i0                ! %o0 is bit mask to check for alignment
        bnz,a   1b
        ldub    [%i0], %o4              ! read next byte

        retl
        andn    %i2, %o0, %i3           ! return size of aligned bytes

        SET_SIZE(bcopy)

#endif  /* NIAGARA_IMPL */

/*
 * Block copy with possibly overlapped operands.
 */
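
/*
 * Illustrative C sketch of the policy implemented below (a sketch,
 * not the actual implementation):
 *
 *      void
 *      ovbcopy(const char *from, char *to, size_t count)
 *      {
 *              if (count == 0)
 *                      return;
 *              if (count <= (size_t)(from > to ? from - to : to - from))
 *                      bcopy(from, to, count);         // no overlap
 *              else if (from >= to)
 *                      while (count--)                 // copy forwards
 *                              *to++ = *from++;
 *              else
 *                      while (count--)                 // copy backwards
 *                              to[count] = from[count];
 *      }
 */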

        ENTRY(ovbcopy)
        tst     %o2                     ! check count
        bgu,a   %ncc, 1f                ! nothing to do or bad arguments
        subcc   %o0, %o1, %o3           ! difference of from and to address

        retl                            ! return
        nop
1:
        bneg,a  %ncc, 2f
        neg     %o3                     ! if < 0, make it positive
2:      cmp     %o2, %o3                ! cmp size and abs(from - to)
        bleu    %ncc, bcopy             ! if size <= abs(diff): use bcopy,
        .empty                          !   no overlap
        cmp     %o0, %o1                ! compare from and to addresses
        blu     %ncc, .ov_bkwd          ! if from < to, copy backwards
        nop
        !
        ! Copy forwards.
        !
.ov_fwd:
        ldub    [%o0], %o3              ! read from address
        inc     %o0                     ! inc from address
        stb     %o3, [%o1]              ! write to address
        deccc   %o2                     ! dec count
        bgu     %ncc, .ov_fwd           ! loop till done
        inc     %o1                     ! inc to address

        retl                            ! return
        nop
        !
        ! Copy backwards.
        !
.ov_bkwd:
        deccc   %o2                     ! dec count
        ldub    [%o0 + %o2], %o3        ! get byte at end of src
        bgu     %ncc, .ov_bkwd          ! loop till done
        stb     %o3, [%o1 + %o2]        ! delay slot, store at end of dst

        retl                            ! return
        nop
        SET_SIZE(ovbcopy)

/*
 * hwblkpagecopy()
 *
 * Copies exactly one page.  This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.
 */
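/*
 * In outline (illustrative sketch, not the actual instruction sequence):
 *
 *      for (off = 0; off < PAGESIZE; off += 0x80)
 *              copy 128 bytes via quad loads and block-init stores;
 *      membar #Sync;
 */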
        ENTRY(hwblkpagecopy)
        save    %sp, -SA(MINFRAME), %sp

        ! %i0 - source address (arg)
        ! %i1 - destination address (arg)
        ! %i2 - length of region (not arg)

        set     PAGESIZE, %i2

        /*
         * Copying exactly one page; PAGESIZE is a multiple of 0x80.
         */
        mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
        prefetch [%i0+0x0], #one_read
        prefetch [%i0+0x40], #one_read
1:
        prefetch [%i0+0x80], #one_read
        prefetch [%i0+0xc0], #one_read
        ldda    [%i0+0x0]%asi, %l0
        ldda    [%i0+0x10]%asi, %l2
        ldda    [%i0+0x20]%asi, %l4
        ldda    [%i0+0x30]%asi, %l6
        stxa    %l0, [%i1+0x0]%asi
        stxa    %l1, [%i1+0x8]%asi
        stxa    %l2, [%i1+0x10]%asi
        stxa    %l3, [%i1+0x18]%asi
        stxa    %l4, [%i1+0x20]%asi
        stxa    %l5, [%i1+0x28]%asi
        stxa    %l6, [%i1+0x30]%asi
        stxa    %l7, [%i1+0x38]%asi
        ldda    [%i0+0x40]%asi, %l0
        ldda    [%i0+0x50]%asi, %l2
        ldda    [%i0+0x60]%asi, %l4
        ldda    [%i0+0x70]%asi, %l6
        stxa    %l0, [%i1+0x40]%asi
        stxa    %l1, [%i1+0x48]%asi
        stxa    %l2, [%i1+0x50]%asi
        stxa    %l3, [%i1+0x58]%asi
        stxa    %l4, [%i1+0x60]%asi
        stxa    %l5, [%i1+0x68]%asi
        stxa    %l6, [%i1+0x70]%asi
        stxa    %l7, [%i1+0x78]%asi

        add     %i0, 0x80, %i0
        subcc   %i2, 0x80, %i2
        bgu,pt  %xcc, 1b
        add     %i1, 0x80, %i1

        membar #Sync
        ret
        restore %g0, 0, %o0
        SET_SIZE(hwblkpagecopy)


/*
 * Transfer data to and from user space -
 * Note that these routines can cause faults.
 * It is assumed that the kernel has nothing mapped
 * below KERNELBASE in the virtual address space.
 *
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
 *
 * Sigh.
 *
 * So there are two extremely similar routines - xcopyin() and xcopyout()
 * which return the errno that we've faithfully computed.  This
 * allows other callers (e.g. uiomove(9F)) to work correctly.
 * Given that these are used pretty heavily, we expand the calling
 * sequences inline for all flavours (rather than making wrappers).
 *
 * There are also stub routines for xcopyout_little and xcopyin_little,
 * which currently are intended to handle requests of <= 16 bytes from
 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
 * is left as an exercise...
 */
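/*
 * Illustrative contrast of the two conventions (sketch only; the
 * variable names are hypothetical):
 *
 *      if (copyout(kaddr, uaddr, len) != 0)    // DDI/DKI: -1 on error
 *              return (EFAULT);                // caller supplies the errno
 *
 *      error = xcopyout(kaddr, uaddr, len);    // returns the errno itself
 *      if (error != 0)
 *              return (error);                 // e.g. for uiomove(9F)
 */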

/*
 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
 *
 * General theory of operation:
 *
 * None of the copyops routines grab a window until it's decided that
 * we need to do a HW block copy operation. This saves a window
 * spill/fill when we're called during socket ops. The typical IO
 * path won't cause spill/fill traps.
 *
 * This code uses a set of 4 limits for the maximum size that will
 * be copied given a particular input/output address alignment.
 * The default limits are:
 *
 * single byte aligned - 256 (hw_copy_limit_1)
 * two byte aligned - 512 (hw_copy_limit_2)
 * four byte aligned - 1024 (hw_copy_limit_4)
 * eight byte aligned - 1024 (hw_copy_limit_8)
 *
 * If the value for a particular limit is zero, the copy will be done
 * via the copy loops rather than block store/quad load instructions.
 *
 * Flow:
 *
 * If count == zero return zero.
 *
 * Store the previous lofault handler into %g6.
 * Place our secondary lofault handler into %g5.
 * Place the address of our nowindow fault handler into %o3.
 * Place the address of the windowed fault handler into %o4.
 * --> We'll use this handler if we end up grabbing a window
 * --> before we use block initializing store and quad load ASIs
 *
 * If count is less than or equal to SMALL_LIMIT (7) we
 * always do a byte for byte copy.
 *
 * If count is > SMALL_LIMIT, we check the alignment of the input
 * and output pointers, then check count against the limit for the
 * detected alignment.  If count exceeds that limit, we copy via
 * block initializing store and quad load instructions.
 *
 * If we don't exceed one of the limits, we store -count in %o3,
 * we store the number of chunks (8, 4, 2 or 1 byte) operated
 * on in our basic copy loop in %o2. Following this we branch
 * to the appropriate copy loop and copy that many chunks.
 * Since we've been adding the chunk size to %o3 each time through
 * as well as decrementing %o2, we can tell if any data is
 * left to be copied by examining %o3. If that is zero, we're
 * done and can go home. If not, we figure out what the largest
 * chunk size left to be copied is and branch to that copy loop
 * unless there's only one byte left. We load that as we're
 * branching to code that stores it just before we return.
 *
 * Fault handlers are invoked if we reference memory that has no
 * current mapping.  All forms share the same copyio_fault handler.
 * This routine handles fixing up the stack and general housecleaning.
 * Each copy operation has a simple fault handler that is then called
 * to do the work specific to the individual operation.  The handlers
 * for copyOP and xcopyOP are found at the end of the individual functions.
 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
 */
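/*
 * Pseudo-code sketch of the flow described above (illustrative only):
 *
 *      if (count == 0)
 *              return (0);
 *      save old t_lofault; install the nowindow fault handler;
 *      if (count <= SMALL_LIMIT)
 *              copy byte for byte; return;
 *      limit = hw_copy_limit_{1,2,4,8};        // per detected alignment
 *      if (limit != 0 && count > limit)
 *              copy via block initializing store and quad load;
 *      else
 *              copy in chunks (%o3 = -count, %o2 = chunk count), then
 *              finish with progressively smaller chunks;
 */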

/*
 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
 */

/*
 * We save the arguments in the following registers in case of a fault:
 *      kaddr - %g2
 *      uaddr - %g3
 *      count - %g4
 */
#define SAVE_SRC        %g2
#define SAVE_DST        %g3
#define SAVE_COUNT      %g4

#define REAL_LOFAULT            %g5
#define SAVED_LOFAULT           %g6

/*
 * Generic copyio fault handler.  This is the first line of defense when a
 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
 * This allows us to share common code for all the flavors of the copy
 * operations, including the _noerr versions.
 *
 * Note that this function will restore the original input parameters before
 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
 * member of the t_copyop structure, if needed.
 */
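/*
 * In outline (illustrative):
 *
 *      copyio_fault:
 *              restore FP state if FPUSED_FLAG was set (non-NIAGARA_IMPL)
 *              or restore t_lofault (NIAGARA_IMPL);
 *              restore the register window;
 *              (%o0, %o1, %o2) = (SAVE_SRC, SAVE_DST, SAVE_COUNT);
 *              jmp REAL_LOFAULT;
 */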
        ENTRY(copyio_fault)
#if !defined(NIAGARA_IMPL)
        btst    FPUSED_FLAG, SAVED_LOFAULT
        bz      1f
        andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT

        wr      %l5, 0, %gsr            ! restore gsr

        btst    FPRS_FEF, %g1
        bz      %icc, 4f
        nop

        ! restore fpregs from stack
        BLD_FP_FROMSTACK(%o2)

        ba,pt   %ncc, 1f
        nop
4:
        FZERO                           ! zero all of the fpregs
        wr      %g1, %g0, %fprs         ! restore fprs
1:
        restore
        mov     SAVE_SRC, %o0
        mov     SAVE_DST, %o1
        jmp     REAL_LOFAULT
        mov     SAVE_COUNT, %o2

#else   /* NIAGARA_IMPL */
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
        restore
        mov     SAVE_SRC, %o0
        mov     SAVE_DST, %o1
        jmp     REAL_LOFAULT
        mov     SAVE_COUNT, %o2

#endif  /* NIAGARA_IMPL */

        SET_SIZE(copyio_fault)

        ENTRY(copyio_fault_nowindow)
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault

        mov     SAVE_SRC, %o0
        mov     SAVE_DST, %o1
        jmp     REAL_LOFAULT
        mov     SAVE_COUNT, %o2
        SET_SIZE(copyio_fault_nowindow)

        ENTRY(copyout)
        sethi   %hi(.copyout_err), REAL_LOFAULT
        or      REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT

#if !defined(NIAGARA_IMPL)
.do_copyout:
        tst     %o2                     ! check for zero count;  quick exit
        bz,pt   %ncc, .co_smallqx
        mov     %o0, SAVE_SRC
        mov     %o1, SAVE_DST
        mov     %o2, SAVE_COUNT
        cmp     %o2, FP_COPY            ! check for small copy/leaf case
        bgt,pt  %ncc, .co_copy_more
        ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
/*
 * Small copy out code
 *
 */
        sethi   %hi(copyio_fault_nowindow), %o3
        or      %o3, %lo(copyio_fault_nowindow), %o3
        membar  #Sync
        stn     %o3, [THREAD_REG + T_LOFAULT]

        mov     ASI_USER, %asi
        cmp     %o2, SHORTCOPY          ! make sure there is enough to align
        ble,pt  %ncc, .co_smallest
        andcc   %o1, 0x7, %o3           ! is dest long word aligned
        bnz,pn  %ncc, .co_align
        andcc   %o1, 1, %o3             ! is dest byte aligned

! Destination is long word aligned
! 8 cases for src alignment; load parts, store long words
.co_al_src:
        andcc   %o0, 7, %o3
        brnz,pt %o3, .co_src_dst_unal8
        nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than FP_COPY bytes.
 * Also handles finish-up for large block moves, so count may be less
 * than 32 bytes.
 */
.co_medlong:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .co_medl31
        nop
.co_medl32:
        ldx     [%o0], %o4              ! move 32 bytes
        subcc   %o2, 32, %o2            ! decrement length count by 32
        stxa    %o4, [%o1]%asi
        ldx     [%o0+8], %o4
        stxa    %o4, [%o1+8]%asi
        ldx     [%o0+16], %o4
        add     %o0, 32, %o0            ! increase src ptr by 32
        stxa    %o4, [%o1+16]%asi
        ldx     [%o0-8], %o4
        add     %o1, 32, %o1            ! increase dst ptr by 32
        bgu,pt  %ncc, .co_medl32        ! repeat if at least 32 bytes left
        stxa    %o4, [%o1-8]%asi
.co_medl31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .co_medl7         ! skip if 7 or fewer bytes left
        nop
.co_medl8:
        ldx     [%o0], %o4              ! move 8 bytes
        add     %o0, 8, %o0             ! increase src ptr by 8
        subcc   %o2, 8, %o2             ! decrease count by 8
        add     %o1, 8, %o1             ! increase dst ptr by 8
        bgu,pt  %ncc, .co_medl8
        stxa    %o4, [%o1-8]%asi
.co_medl7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bnz,pt  %ncc, .co_small4        ! do final bytes if not finished

.co_smallx:                             ! finish up and exit
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
.co_smallqx:
        retl
        mov     %g0, %o0

.co_small4:
        cmp     %o2, 4
        blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
        nop                             !
        ld      [%o0], %o4              ! move 4 bytes
        add     %o0, 4, %o0             ! increase src ptr by 4
        add     %o1, 4, %o1             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bz,pt   %ncc, .co_smallx
        stwa    %o4, [%o1-4]%asi

.co_small3x:                            ! Exactly 1, 2, or 3 bytes remain
        subcc   %o2, 1, %o2             ! reduce count for cc test
        ldub    [%o0], %o4              ! load one byte
        bz,pt   %ncc, .co_smallx
        stba    %o4, [%o1]%asi          ! store one byte
        ldub    [%o0+1], %o4            ! load second byte
        subcc   %o2, 1, %o2
        bz,pt   %ncc, .co_smallx
        stba    %o4, [%o1+1]%asi        ! store second byte
        ldub    [%o0+2], %o4            ! load third byte
        ba      .co_smallx
        stba    %o4, [%o1+2]%asi        ! store third byte

.co_smallest:                           ! 7 or fewer bytes remain
        cmp     %o2, 4
        blt,pt  %ncc, .co_small3x
        nop
        ldub    [%o0], %o4              ! read byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        stba    %o4, [%o1]%asi          ! write byte
        ldub    [%o0+1], %o4            ! repeat for total of 4 bytes
        add     %o0, 4, %o0             ! advance src by 4
        stba    %o4, [%o1+1]%asi
        ldub    [%o0-2], %o4
        add     %o1, 4, %o1             ! advance dst by 4
        stba    %o4, [%o1-2]%asi
        ldub    [%o0-1], %o4
        bnz,pt  %ncc, .co_small3x
        stba    %o4, [%o1-1]%asi
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
        retl
        mov     %g0, %o0

.co_align:                              ! byte align test in prior branch delay
        bnz,pt  %ncc, .co_al_d1
.co_al_d1f:                             ! dest is now half word aligned
        andcc   %o1, 2, %o3
        bnz,pt  %ncc, .co_al_d2
.co_al_d2f:                             ! dest is now word aligned
        andcc   %o1, 4, %o3             ! is dest longword aligned?
        bz,pt   %ncc, .co_al_src
        nop
.co_al_d4:                              ! dest is word aligned;  src is unknown
        ldub    [%o0], %o4              ! move a word (src align unknown)
        ldub    [%o0+1], %o3
        sll     %o4, 24, %o4            ! position
        sll     %o3, 16, %o3            ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%o0+2], %o4
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%o0+3], %o4
        or      %o4, %o3, %o4           ! merge
        stwa    %o4,[%o1]%asi           ! store four bytes
        add     %o0, 4, %o0             ! adjust src by 4
        add     %o1, 4, %o1             ! adjust dest by 4
        sub     %o2, 4, %o2             ! adjust count by 4
        andcc   %o0, 7, %o3             ! check for src long word alignment
        brz,pt  %o3, .co_medlong
.co_src_dst_unal8:
        ! dst is 8-byte aligned, src is not
        ! Size is less than FP_COPY.
        ! The following code selects by alignment.
        andcc   %o0, 0x3, %o3           ! test word alignment
        bz,pt   %ncc, .co_medword
        nop
        andcc   %o0, 0x1, %o3           ! test halfword alignment
        bnz,pt  %ncc, .co_med_byte      ! go to byte move if not halfword
        andcc   %o0, 0x2, %o3           ! test which byte alignment
        ba      .co_medhalf
        nop
.co_al_d1:                              ! align dest to half word
        ldub    [%o0], %o4              ! move a byte
        add     %o0, 1, %o0
        stba    %o4, [%o1]%asi
        add     %o1, 1, %o1
        andcc   %o1, 2, %o3
        bz,pt   %ncc, .co_al_d2f
        sub     %o2, 1, %o2
.co_al_d2:                              ! align dest to word
        ldub    [%o0], %o4              ! move a half-word (src align unknown)
        ldub    [%o0+1], %o3
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o4           ! merge
        stha    %o4, [%o1]%asi
        add     %o0, 2, %o0
        add     %o1, 2, %o1
        andcc   %o1, 4, %o3             ! is dest longword aligned?
        bz,pt   %ncc, .co_al_src
        sub     %o2, 2, %o2
        ba      .co_al_d4
        nop
/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination is in cache for medium
 * to short data moves.
 */
.co_medword:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .co_medw31
        nop
.co_medw32:
        ld      [%o0], %o4              ! move a block of 32 bytes
        stwa    %o4, [%o1]%asi
        ld      [%o0+4], %o4
        stwa    %o4, [%o1+4]%asi
        ld      [%o0+8], %o4
        stwa    %o4, [%o1+8]%asi
        ld      [%o0+12], %o4
        stwa    %o4, [%o1+12]%asi
        ld      [%o0+16], %o4
        stwa    %o4, [%o1+16]%asi
        ld      [%o0+20], %o4
        subcc   %o2, 32, %o2            ! decrement length count
        stwa    %o4, [%o1+20]%asi
        ld      [%o0+24], %o4
        add     %o0, 32, %o0            ! increase src ptr by 32
        stwa    %o4, [%o1+24]%asi
        ld      [%o0-4], %o4
        add     %o1, 32, %o1            ! increase dst ptr by 32
        bgu,pt  %ncc, .co_medw32        ! repeat if at least 32 bytes left
        stwa    %o4, [%o1-4]%asi
.co_medw31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .co_medw7         ! skip if 7 or fewer bytes left
        nop                             !
.co_medw15:
        ld      [%o0], %o4              ! move a block of 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        stwa    %o4, [%o1]%asi
        add     %o0, 8, %o0             ! increase src ptr by 8
        ld      [%o0-4], %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
        bgu,pt  %ncc, .co_medw15
        stwa    %o4, [%o1-4]%asi
.co_medw7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .co_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
        nop                             !
        ld      [%o0], %o4              ! move 4 bytes
        add     %o0, 4, %o0             ! increase src ptr by 4
        add     %o1, 4, %o1             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bnz     .co_small3x
        stwa    %o4, [%o1-4]%asi
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
        retl
        mov     %g0, %o0

.co_medhalf:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .co_medh31
        nop
.co_medh32:                             ! load and store block of 32 bytes

        lduh    [%o0], %o4              ! move 32 bytes
        subcc   %o2, 32, %o2            ! decrement length count
        lduw    [%o0+2], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+6], %o4
        or      %o4, %o3, %o4
        stxa    %o4, [%o1]%asi

        lduh    [%o0+8], %o4
        lduw    [%o0+10], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+14], %o4
        or      %o4, %o3, %o4
        stxa    %o4, [%o1+8]%asi

        lduh    [%o0+16], %o4
        lduw    [%o0+18], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+22], %o4
        or      %o4, %o3, %o4
        stxa    %o4, [%o1+16]%asi

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        lduh    [%o0-8], %o4
        lduw    [%o0-6], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0-2], %o4
        or      %o3, %o4, %o4
        bgu,pt  %ncc, .co_medh32        ! repeat if at least 32 bytes left
        stxa    %o4, [%o1-8]%asi

.co_medh31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .co_medh7         ! skip if 7 or fewer bytes left
        nop                             !
.co_medh15:
        lduh    [%o0], %o4              ! move 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        lduw    [%o0+2], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        add     %o1, 8, %o1             ! increase dst ptr by 8
        lduh    [%o0+6], %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .co_medh15
        stxa    %o4, [%o1-8]%asi
.co_medh7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .co_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
        nop                             !
        lduh    [%o0], %o4
        sll     %o4, 16, %o4
        lduh    [%o0+2], %o3
        or      %o3, %o4, %o4
        subcc   %o2, 4, %o2
        add     %o0, 4, %o0
        add     %o1, 4, %o1
        bnz     .co_small3x
        stwa    %o4, [%o1-4]%asi
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
        retl
        mov     %g0, %o0

        .align 16
.co_med_byte:
        bnz,pt  %ncc, .co_medbh32a      ! go to correct byte move
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .co_medb31
        nop
.co_medb32:                             ! Alignment 1 or 5
        subcc   %o2, 32, %o2            ! decrement length count

        ldub    [%o0], %o4              ! load and store a block of 32 bytes
        sllx    %o4, 56, %o3
        lduh    [%o0+1], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+3], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stxa    %o4, [%o1]%asi

        ldub    [%o0+8], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0+9], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+11], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+15], %o4
        or      %o4, %o3, %o4
        stxa    %o4, [%o1+8]%asi

        ldub    [%o0+16], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0+17], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+19], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+23], %o4
        or      %o4, %o3, %o4
        stxa    %o4, [%o1+16]%asi

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        ldub    [%o0-8], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0-7], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0-5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0-1], %o4
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .co_medb32        ! repeat if at least 32 bytes left
        stxa    %o4, [%o1-8]%asi

.co_medb31:                             ! 31 or fewer bytes remaining
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .co_medb7         ! skip if 7 or fewer bytes left
        nop                             !
.co_medb15:

        ldub    [%o0], %o4              ! load and store a block of 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        sllx    %o4, 56, %o3
        lduh    [%o0+1], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+3], %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .co_medb15
        stxa    %o4, [%o1-8]%asi
.co_medb7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .co_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
        nop                             !
        ldub    [%o0], %o4              ! move 4 bytes
        sll     %o4, 24, %o3
        lduh    [%o0+1], %o4
        sll     %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+3], %o4
        or      %o4, %o3, %o4
        subcc   %o2, 4, %o2
        add     %o0, 4, %o0
        add     %o1, 4, %o1
        bnz     .co_small3x
        stwa    %o4, [%o1-4]%asi
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
        retl
        mov     %g0, %o0

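        ! Src ends on an odd byte offset. Bit 1 of the src address,
        ! tested in the delay slot of the branch to .co_med_byte,
        ! picked .co_medb32 (above) for offsets 1/5 and .co_medbh32a
        ! (below) for offsets 3/7; the two variants differ only in
        ! how each 8-byte word is assembled from byte/half/word loads.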
        .align 16
.co_medbh32a:
        ble,pt  %ncc, .co_medbh31
        nop
.co_medbh32:                            ! Alignment 3 or 7
        subcc   %o2, 32, %o2            ! decrement length count

        ldub    [%o0], %o4              ! load and store a block of 32 bytes
        sllx    %o4, 56, %o3
        lduw    [%o0+1], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stxa    %o4, [%o1]%asi

        ldub    [%o0+8], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0+9], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+13], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+15], %o4
        or      %o4, %o3, %o4
        stxa    %o4, [%o1+8]%asi

        ldub    [%o0+16], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0+17], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+21], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+23], %o4
        or      %o4, %o3, %o4
        stxa    %o4, [%o1+16]%asi

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        ldub    [%o0-8], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0-7], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0-3], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0-1], %o4
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .co_medbh32       ! repeat if at least 32 bytes left
        stxa    %o4, [%o1-8]%asi

.co_medbh31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .co_medb7         ! skip if 7 or fewer bytes left
        nop                             !
.co_medbh15:
        ldub    [%o0], %o4              ! load and store a block of 8 bytes
        sllx    %o4, 56, %o3
        lduw    [%o0+1], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stxa    %o4, [%o1]%asi
        subcc   %o2, 8, %o2             ! decrement length count
        add     %o1, 8, %o1             ! increase dst ptr by 8
        add     %o0, 8, %o0             ! increase src ptr by 8
        bgu,pt  %ncc, .co_medbh15
        stxa    %o4, [%o1-8]%asi
        ba      .co_medb7
        nop
/*
 * End of small copy (no window) code
 */

/*
 * Long copy code
 */
.co_copy_more:
        sethi   %hi(copyio_fault), %o3
        or      %o3, %lo(copyio_fault), %o3
        membar  #Sync
        stn     %o3, [THREAD_REG + T_LOFAULT]

/*
 * The following code is for large copies. We know there are at
 * least FP_COPY bytes available. FP regs are used, so we save
 * the registers and fp regs before starting.
 */
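        ! The save switches register windows, so the src/dst/count
        ! arguments are now %i0/%i1/%i2; FPUSED_FLAG in SAVED_LOFAULT
        ! tells the fault path that fp state must be restored.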
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        or      SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
        rd      %fprs, %g1              ! check for unused fp
        ! if fprs.fef == 0, set it.
        ! Setting it when already set costs more than checking
        andcc   %g1, FPRS_FEF, %g1      ! test FEF, fprs.du = fprs.dl = 0
        bz,pt   %ncc, .co_fp_unused
        mov     ASI_USER, %asi
        BST_FP_TOSTACK(%o3)
        ba      .co_fp_ready
.co_fp_unused:
        prefetch [%i0 + (1 * CACHE_LINE)], #one_read
        wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
.co_fp_ready:
        rd      %gsr, %l5               ! save %gsr value
        andcc   %i1, 1, %o3             ! is dest byte aligned
        bnz,pt  %ncc, .co_big_d1
.co_big_d1f:                            ! dest is now half word aligned
        andcc   %i1, 2, %o3
        bnz,pt  %ncc, .co_big_d2
.co_big_d2f:                            ! dest is now word aligned
        andcc   %i1, 4, %o3             ! is dest longword aligned
        bnz,pt  %ncc, .co_big_d4
.co_big_d4f:                            ! dest is now long word aligned
        andcc   %i0, 7, %o3             ! is src long word aligned
        brnz,pt %o3, .co_big_unal8
        prefetch [%i0 + (2 * CACHE_LINE)], #one_read
        ! Src and dst are long word aligned
        ! align dst to 64 byte boundary
        andcc   %i1, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
        brz,pn  %o3, .co_al_to_64
        nop
        sub     %o3, 64, %o3            ! %o3 has negative bytes to move
        add     %i2, %o3, %i2           ! adjust remaining count
        andcc   %o3, 8, %o4             ! odd long words to move?
        brz,pt  %o4, .co_al_to_16
        nop
        add     %o3, 8, %o3
        ldx     [%i0], %o4
        add     %i0, 8, %i0             ! increment src ptr
        stxa    %o4, [%i1]ASI_USER
        add     %i1, 8, %i1             ! increment dst ptr
! Dest is aligned on 16 bytes, src 8 byte aligned
.co_al_to_16:
        andcc   %o3, 0x30, %o4          ! 16-byte chunks still to move?
        brz,pt  %o4, .co_al_to_64
        nop
.co_al_mv_16:
        add     %o3, 16, %o3
        ldx     [%i0], %o4
        stxa    %o4, [%i1]ASI_USER
        add     %i0, 16, %i0            ! increment src ptr
        ldx     [%i0-8], %o4
        add     %i1, 8, %i1             ! increment dst ptr
        stxa    %o4, [%i1]ASI_USER
        andcc   %o3, 0x30, %o4
        brnz,pt %o4, .co_al_mv_16
        add     %i1, 8, %i1             ! increment dst ptr
! Dest is aligned on 64 bytes, src 8 byte aligned
.co_al_to_64:
        ! Determine source alignment
        ! to correct 8 byte offset
        andcc   %i0, 32, %o3
        brnz,pn %o3, .co_aln_1
        andcc   %i0, 16, %o3
        brnz,pn %o3, .co_aln_01
        andcc   %i0, 8, %o3
        brz,pn  %o3, .co_aln_000
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .co_aln_001
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.co_aln_01:
        brnz,pn %o3, .co_aln_011
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .co_aln_010
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.co_aln_1:
        andcc   %i0, 16, %o3
        brnz,pn %o3, .co_aln_11
        andcc   %i0, 8, %o3
        brnz,pn %o3, .co_aln_101
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .co_aln_100
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.co_aln_11:
        brz,pn  %o3, .co_aln_110
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read

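/*
 * The eight .co_aln_* variants below software-pipeline the block
 * move; the label digits encode src address bits <5:3>. Each
 * variant first drains 8-byte words with ldd until src reaches a
 * 64-byte boundary, then every loop pass block-loads 64 bytes into
 * %d16-%d30, rotates the leftover doubles from the previous pass
 * into %d0-%d14 with fmovd, and block-stores 64 bytes to the user
 * destination. %i1 holds (dst - src), so advancing %i0 walks both
 * buffers; the stxa to ASI_STBI_AIUS is the block-initializing
 * store noted below, intended to establish the destination line
 * without first reading it.
 */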
.co_aln_111:
! Alignment off by 8 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        add     %i0, 8, %i0
        sub     %i2, 8, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.co_aln_111_loop:
        ldda    [%i0]ASI_BLK_P,%d16             ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d2
        fmovd   %d18, %d4
        fmovd   %d20, %d6
        fmovd   %d22, %d8
        fmovd   %d24, %d10
        fmovd   %d26, %d12
        fmovd   %d28, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_AIUS
        add     %i0, 64, %i0
        fmovd   %d30, %d0
        bgt,pt  %ncc, .co_aln_111_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        stda    %d0, [%i1]ASI_USER
        ba      .co_remain_stuff
        add     %i1, 8, %i1
        ! END OF aln_111

.co_aln_110:
! Alignment off by 16 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        add     %i0, 16, %i0
        sub     %i2, 16, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.co_aln_110_loop:
        ldda    [%i0]ASI_BLK_P,%d16             ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d4
        fmovd   %d18, %d6
        fmovd   %d20, %d8
        fmovd   %d22, %d10
        fmovd   %d24, %d12
        fmovd   %d26, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_AIUS
        add     %i0, 64, %i0
        fmovd   %d28, %d0
        fmovd   %d30, %d2
        bgt,pt  %ncc, .co_aln_110_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        stda    %d0, [%i1]%asi
        stda    %d2, [%i1+8]%asi
        ba      .co_remain_stuff
        add     %i1, 16, %i1
        ! END OF aln_110

.co_aln_101:
! Alignment off by 24 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        add     %i0, 24, %i0
        sub     %i2, 24, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.co_aln_101_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d6
        fmovd   %d18, %d8
        fmovd   %d20, %d10
        fmovd   %d22, %d12
        fmovd   %d24, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_AIUS
        add     %i0, 64, %i0
        fmovd   %d26, %d0
        fmovd   %d28, %d2
        fmovd   %d30, %d4
        bgt,pt  %ncc, .co_aln_101_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        stda    %d0, [%i1]%asi
        stda    %d2, [%i1+8]%asi
        stda    %d4, [%i1+16]%asi
        ba      .co_remain_stuff
        add     %i1, 24, %i1
        ! END OF aln_101

.co_aln_100:
! Alignment off by 32 bytes
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16],%d4
        ldd     [%i0+24],%d6
        add     %i0, 32, %i0
        sub     %i2, 32, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.co_aln_100_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d8
        fmovd   %d18, %d10
        fmovd   %d20, %d12
        fmovd   %d22, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_AIUS
        add     %i0, 64, %i0
        fmovd   %d24, %d0
        fmovd   %d26, %d2
        fmovd   %d28, %d4
        fmovd   %d30, %d6
        bgt,pt  %ncc, .co_aln_100_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        stda    %d0, [%i1]%asi
        stda    %d2, [%i1+8]%asi
        stda    %d4, [%i1+16]%asi
        stda    %d6, [%i1+24]%asi
        ba      .co_remain_stuff
        add     %i1, 32, %i1
        ! END OF aln_100

.co_aln_011:
! Alignment off by 40 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        ldd     [%i0+24], %d6
        ldd     [%i0+32], %d8
        add     %i0, 40, %i0
        sub     %i2, 40, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.co_aln_011_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d10
        fmovd   %d18, %d12
        fmovd   %d20, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_AIUS
        add     %i0, 64, %i0
        fmovd   %d22, %d0
        fmovd   %d24, %d2
        fmovd   %d26, %d4
        fmovd   %d28, %d6
        fmovd   %d30, %d8
        bgt,pt  %ncc, .co_aln_011_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        stda    %d0, [%i1]%asi
        stda    %d2, [%i1+8]%asi
        stda    %d4, [%i1+16]%asi
        stda    %d6, [%i1+24]%asi
        stda    %d8, [%i1+32]%asi
        ba      .co_remain_stuff
        add     %i1, 40, %i1
        ! END OF aln_011

.co_aln_010:
! Alignment off by 48 bytes
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        ldd     [%i0+24], %d6
        ldd     [%i0+32], %d8
        ldd     [%i0+40], %d10
        add     %i0, 48, %i0
        sub     %i2, 48, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.co_aln_010_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d12
        fmovd   %d18, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_AIUS
        add     %i0, 64, %i0
        fmovd   %d20, %d0
        fmovd   %d22, %d2
        fmovd   %d24, %d4
        fmovd   %d26, %d6
        fmovd   %d28, %d8
        fmovd   %d30, %d10
        bgt,pt  %ncc, .co_aln_010_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        stda    %d0, [%i1]%asi
        stda    %d2, [%i1+8]%asi
        stda    %d4, [%i1+16]%asi
        stda    %d6, [%i1+24]%asi
        stda    %d8, [%i1+32]%asi
        stda    %d10, [%i1+40]%asi
        ba      .co_remain_stuff
        add     %i1, 48, %i1
        ! END OF aln_010

.co_aln_001:
! Alignment off by 56 bytes
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        ldd     [%i0+24], %d6
        ldd     [%i0+32], %d8
        ldd     [%i0+40], %d10
        ldd     [%i0+48], %d12
        add     %i0, 56, %i0
        sub     %i2, 56, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.co_aln_001_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_AIUS
        add     %i0, 64, %i0
        fmovd   %d18, %d0
        fmovd   %d20, %d2
        fmovd   %d22, %d4
        fmovd   %d24, %d6
        fmovd   %d26, %d8
        fmovd   %d28, %d10
        fmovd   %d30, %d12
        bgt,pt  %ncc, .co_aln_001_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        stda    %d0, [%i1]%asi
        stda    %d2, [%i1+8]%asi
        stda    %d4, [%i1+16]%asi
        stda    %d6, [%i1+24]%asi
        stda    %d8, [%i1+32]%asi
        stda    %d10, [%i1+40]%asi
        stda    %d12, [%i1+48]%asi
        ba      .co_remain_stuff
        add     %i1, 56, %i1
        ! END OF aln_001

.co_aln_000:
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.co_aln_000_loop:
        ldda    [%i0]ASI_BLK_P,%d0
        subcc   %o3, 64, %o3
        stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_AIUS
        add     %i0, 64, %i0
        bgt,pt  %ncc, .co_aln_000_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        ! END OF aln_000

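        ! Both pointers are now 8-byte aligned and at most 127
        ! residue bytes remain in %i2; finish with 32-, 8-, and
        ! 4-byte moves plus a byte tail.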
.co_remain_stuff:
        subcc   %i2, 31, %i2            ! adjust length to allow cc test
        ble,pt  %ncc, .co_aln_31
        nop
.co_aln_32:
        ldx     [%i0], %o4              ! move 32 bytes
        subcc   %i2, 32, %i2            ! decrement length count by 32
        stxa    %o4, [%i1]%asi
        ldx     [%i0+8], %o4
        stxa    %o4, [%i1+8]%asi
        ldx     [%i0+16], %o4
        add     %i0, 32, %i0            ! increase src ptr by 32
        stxa    %o4, [%i1+16]%asi
        ldx     [%i0-8], %o4
        add     %i1, 32, %i1            ! increase dst ptr by 32
        bgu,pt  %ncc, .co_aln_32        ! repeat if at least 32 bytes left
        stxa    %o4, [%i1-8]%asi
.co_aln_31:
        addcc   %i2, 24, %i2            ! adjust count to be off by 7
        ble,pt  %ncc, .co_aln_7         ! skip if 7 or fewer bytes left
        nop                             !
.co_aln_15:
        ldx     [%i0], %o4              ! move 8 bytes
        add     %i0, 8, %i0             ! increase src ptr by 8
        subcc   %i2, 8, %i2             ! decrease count by 8
        add     %i1, 8, %i1             ! increase dst ptr by 8
        bgu,pt  %ncc, .co_aln_15
        stxa    %o4, [%i1-8]%asi
.co_aln_7:
        addcc   %i2, 7, %i2             ! finish adjustment of remaining count
        bz,pt   %ncc, .co_exit          ! exit if finished
        cmp     %i2, 4
        blt,pt  %ncc, .co_unaln3x       ! skip if less than 4 bytes left
        nop                             !
        ld      [%i0], %o4              ! move 4 bytes
        add     %i0, 4, %i0             ! increase src ptr by 4
        add     %i1, 4, %i1             ! increase dst ptr by 4
        subcc   %i2, 4, %i2             ! decrease count by 4
        bnz     .co_unaln3x
        stwa    %o4, [%i1-4]%asi
        ba      .co_exit
        nop

        ! destination alignment code
.co_big_d1:
        ldub    [%i0], %o4              ! move a byte
        add     %i0, 1, %i0
        stba    %o4, [%i1]ASI_USER
        add     %i1, 1, %i1
        andcc   %i1, 2, %o3
        bz,pt   %ncc, .co_big_d2f
        sub     %i2, 1, %i2
.co_big_d2:
        ldub    [%i0], %o4              ! move a half-word (src align unknown)
        ldub    [%i0+1], %o3
        add     %i0, 2, %i0
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o4           ! merge
        stha    %o4, [%i1]ASI_USER
        add     %i1, 2, %i1
        andcc   %i1, 4, %o3             ! is dest longword aligned
        bz,pt   %ncc, .co_big_d4f
        sub     %i2, 2, %i2
.co_big_d4:                             ! dest is at least word aligned
        nop
        ldub    [%i0], %o4              ! move a word (src align unknown)
        ldub    [%i0+1], %o3
        sll     %o4, 24, %o4            ! position
        sll     %o3, 16, %o3            ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%i0+2], %o4
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%i0+3], %o4
        or      %o4, %o3, %o4           ! merge
        stwa    %o4,[%i1]ASI_USER       ! store four bytes
        add     %i0, 4, %i0             ! adjust src by 4
        add     %i1, 4, %i1             ! adjust dest by 4
        ba      .co_big_d4f
        sub     %i2, 4, %i2             ! adjust count by 4


        ! Dst is on an 8 byte boundary; src is not
.co_big_unal8:
        andcc   %i1, 0x3f, %o3          ! is dst 64-byte block aligned?
        bz      %ncc, .co_unalnsrc
        sub     %o3, 64, %o3            ! %o3 will be multiple of 8
        neg     %o3                     ! bytes until dest is 64 byte aligned
        sub     %i2, %o3, %i2           ! update cnt with bytes to be moved
        ! Move bytes according to source alignment
        andcc   %i0, 0x1, %o4
        bnz     %ncc, .co_unalnbyte     ! check for byte alignment
        nop
        andcc   %i0, 2, %o4             ! check for half word alignment
        bnz     %ncc, .co_unalnhalf
        nop
        ! Src is word aligned, move bytes until dest 64 byte aligned
.co_unalnword:
        ld      [%i0], %o4              ! load 4 bytes
        stwa    %o4, [%i1]%asi          ! and store 4 bytes
        ld      [%i0+4], %o4            ! load 4 bytes
        add     %i0, 8, %i0             ! increase src ptr by 8
        stwa    %o4, [%i1+4]%asi        ! and store 4 bytes
        subcc   %o3, 8, %o3             ! decrease count by 8
        bnz     %ncc, .co_unalnword
        add     %i1, 8, %i1             ! increase dst ptr by 8
        ba      .co_unalnsrc
        nop

        ! Src is half-word aligned, move bytes until dest 64 byte aligned
.co_unalnhalf:
        lduh    [%i0], %o4              ! load 2 bytes
        sllx    %o4, 32, %i3            ! shift left
        lduw    [%i0+2], %o4
        or      %o4, %i3, %i3
        sllx    %i3, 16, %i3
        lduh    [%i0+6], %o4
        or      %o4, %i3, %i3
        stxa    %i3, [%i1]ASI_USER
        add     %i0, 8, %i0
        subcc   %o3, 8, %o3
        bnz     %ncc, .co_unalnhalf
        add     %i1, 8, %i1
        ba      .co_unalnsrc
        nop

        ! Src is Byte aligned, move bytes until dest 64 byte aligned
.co_unalnbyte:
        sub     %i1, %i0, %i1           ! share pointer advance
.co_unalnbyte_loop:
        ldub    [%i0], %o4
        sllx    %o4, 56, %i3
        lduh    [%i0+1], %o4
        sllx    %o4, 40, %o4
        or      %o4, %i3, %i3
        lduh    [%i0+3], %o4
        sllx    %o4, 24, %o4
        or      %o4, %i3, %i3
        lduh    [%i0+5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %i3, %i3
        ldub    [%i0+7], %o4
        or      %o4, %i3, %i3
        stxa    %i3, [%i1+%i0]ASI_USER
        subcc   %o3, 8, %o3
        bnz     %ncc, .co_unalnbyte_loop
        add     %i0, 8, %i0
        add     %i1,%i0, %i1            ! restore pointer

        ! Destination is now block (64 byte aligned), src is not 8 byte aligned
.co_unalnsrc:
        andn    %i2, 0x3f, %i3          ! %i3 is multiple of block size
        and     %i2, 0x3f, %i2          ! residue bytes in %i2
        add     %i2, 64, %i2            ! Ensure we don't load beyond
        sub     %i3, 64, %i3            ! end of source buffer

        andn    %i0, 0x3f, %o4          ! %o4 has block aligned src address
        prefetch [%o4 + (3 * CACHE_LINE)], #one_read
        alignaddr %i0, %g0, %g0         ! generate %gsr
        add     %i0, %i3, %i0           ! advance %i0 to after blocks
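        ! alignaddr with a zero offset writes the low 3 bits of %i0
        ! into GSR.align; every faligndata below then extracts the 8
        ! dst-aligned bytes from the 16-byte concatenation of two
        ! neighboring src doubles.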
        !
        ! Determine source alignment to correct 8 byte offset
        andcc   %i0, 0x20, %o3
        brnz,pn %o3, .co_unaln_1
        andcc   %i0, 0x10, %o3
        brnz,pn %o3, .co_unaln_01
        andcc   %i0, 0x08, %o3
        brz,a   %o3, .co_unaln_000
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .co_unaln_001
        nop
.co_unaln_01:
        brnz,a  %o3, .co_unaln_011
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .co_unaln_010
        nop
.co_unaln_1:
        brnz,pn %o3, .co_unaln_11
        andcc   %i0, 0x08, %o3
        brnz,a  %o3, .co_unaln_101
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .co_unaln_100
        nop
.co_unaln_11:
        brz,pn  %o3, .co_unaln_110
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read

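        ! Each .co_unaln_* variant preloads the doubles of the
        ! current 64-byte src block that still hold wanted data,
        ! then the loop block-loads the next 64 bytes into
        ! %d16-%d30, faligndata-merges adjacent pairs into
        ! %d48-%d62, and block-stores the result to the user dst.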
.co_unaln_111:
        ldd     [%o4+56], %d14
.co_unaln_111_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d14, %d16, %d48
        faligndata %d16, %d18, %d50
        faligndata %d18, %d20, %d52
        faligndata %d20, %d22, %d54
        faligndata %d22, %d24, %d56
        faligndata %d24, %d26, %d58
        faligndata %d26, %d28, %d60
        faligndata %d28, %d30, %d62
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_AIUS
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .co_unaln_111_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .co_unaln_done
        nop

.co_unaln_110:
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.co_unaln_110_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d12, %d14, %d48
        faligndata %d14, %d16, %d50
        faligndata %d16, %d18, %d52
        faligndata %d18, %d20, %d54
        faligndata %d20, %d22, %d56
        faligndata %d22, %d24, %d58
        faligndata %d24, %d26, %d60
        faligndata %d26, %d28, %d62
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_AIUS
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .co_unaln_110_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .co_unaln_done
        nop

.co_unaln_101:
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.co_unaln_101_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d10, %d12, %d48
        faligndata %d12, %d14, %d50
        faligndata %d14, %d16, %d52
        faligndata %d16, %d18, %d54
        faligndata %d18, %d20, %d56
        faligndata %d20, %d22, %d58
        faligndata %d22, %d24, %d60
        faligndata %d24, %d26, %d62
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_AIUS
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .co_unaln_101_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .co_unaln_done
        nop

.co_unaln_100:
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.co_unaln_100_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d8, %d10, %d48
        faligndata %d10, %d12, %d50
        faligndata %d12, %d14, %d52
        faligndata %d14, %d16, %d54
        faligndata %d16, %d18, %d56
        faligndata %d18, %d20, %d58
        faligndata %d20, %d22, %d60
        faligndata %d22, %d24, %d62
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_AIUS
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .co_unaln_100_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .co_unaln_done
        nop

.co_unaln_011:
        ldd     [%o4+24], %d6
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.co_unaln_011_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d6, %d8, %d48
        faligndata %d8, %d10, %d50
        faligndata %d10, %d12, %d52
        faligndata %d12, %d14, %d54
        faligndata %d14, %d16, %d56
        faligndata %d16, %d18, %d58
        faligndata %d18, %d20, %d60
        faligndata %d20, %d22, %d62
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_AIUS
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .co_unaln_011_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .co_unaln_done
        nop

.co_unaln_010:
        ldd     [%o4+16], %d4
        ldd     [%o4+24], %d6
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.co_unaln_010_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d4, %d6, %d48
        faligndata %d6, %d8, %d50
        faligndata %d8, %d10, %d52
        faligndata %d10, %d12, %d54
        faligndata %d12, %d14, %d56
        faligndata %d14, %d16, %d58
        faligndata %d16, %d18, %d60
        faligndata %d18, %d20, %d62
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_AIUS
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .co_unaln_010_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .co_unaln_done
        nop

.co_unaln_001:
        ldd     [%o4+8], %d2
        ldd     [%o4+16], %d4
        ldd     [%o4+24], %d6
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.co_unaln_001_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d2, %d4, %d48
        faligndata %d4, %d6, %d50
        faligndata %d6, %d8, %d52
        faligndata %d8, %d10, %d54
        faligndata %d10, %d12, %d56
        faligndata %d12, %d14, %d58
        faligndata %d14, %d16, %d60
        faligndata %d16, %d18, %d62
        fmovd   %d18, %d2
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_AIUS
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .co_unaln_001_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read
        ba      .co_unaln_done
        nop

.co_unaln_000:
        ldda    [%o4]ASI_BLK_P, %d0
.co_unaln_000_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d0, %d2, %d48
        faligndata %d2, %d4, %d50
        faligndata %d4, %d6, %d52
        faligndata %d6, %d8, %d54
        faligndata %d8, %d10, %d56
        faligndata %d10, %d12, %d58
        faligndata %d12, %d14, %d60
        faligndata %d14, %d16, %d62
        fmovd   %d16, %d0
        fmovd   %d18, %d2
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_AIUS
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .co_unaln_000_loop
        prefetch [%o4 + (4 * CACHE_LINE)], #one_read

.co_unaln_done:
        ! Handle trailing bytes, 64 to 127
        ! Dest long word aligned, Src not long word aligned
        cmp     %i2, 15
        bleu    %ncc, .co_unaln_short

        andn    %i2, 0x7, %i3           ! %i3 is multiple of 8
        and     %i2, 0x7, %i2           ! residue bytes in %i2
        add     %i2, 8, %i2
        sub     %i3, 8, %i3             ! ensure we don't load past end of src
        andn    %i0, 0x7, %o4           ! %o4 has long word aligned src address
        add     %i0, %i3, %i0           ! advance %i0 to after multiple of 8
        ldd     [%o4], %d0              ! fetch partial word
.co_unaln_by8:
        ldd     [%o4+8], %d2
        add     %o4, 8, %o4
        faligndata %d0, %d2, %d16
        subcc   %i3, 8, %i3
        stda    %d16, [%i1]%asi
        fmovd   %d2, %d0
        bgu,pt  %ncc, .co_unaln_by8
        add     %i1, 8, %i1

.co_unaln_short:
        cmp     %i2, 8
        blt,pt  %ncc, .co_unalnfin
        nop
        ldub    [%i0], %o4
        sll     %o4, 24, %o3
        ldub    [%i0+1], %o4
        sll     %o4, 16, %o4
        or      %o4, %o3, %o3
        ldub    [%i0+2], %o4
        sll     %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%i0+3], %o4
        or      %o4, %o3, %o3
        stwa    %o3, [%i1]%asi
        ldub    [%i0+4], %o4
        sll     %o4, 24, %o3
        ldub    [%i0+5], %o4
        sll     %o4, 16, %o4
        or      %o4, %o3, %o3
        ldub    [%i0+6], %o4
        sll     %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%i0+7], %o4
        or      %o4, %o3, %o3
        stwa    %o3, [%i1+4]%asi
        add     %i0, 8, %i0
        add     %i1, 8, %i1
        sub     %i2, 8, %i2
.co_unalnfin:
        cmp     %i2, 4
        blt,pt  %ncc, .co_unalnz
        tst     %i2
        ldub    [%i0], %o3              ! read byte
        subcc   %i2, 4, %i2             ! reduce count by 4
        sll     %o3, 24, %o3            ! position
        ldub    [%i0+1], %o4
        sll     %o4, 16, %o4            ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%i0+2], %o4
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o3           ! merge
        add     %i1, 4, %i1             ! advance dst by 4
        ldub    [%i0+3], %o4
        add     %i0, 4, %i0             ! advance src by 4
        or      %o4, %o3, %o4           ! merge
        bnz,pt  %ncc, .co_unaln3x
        stwa    %o4, [%i1-4]%asi
        ba      .co_exit
        nop
.co_unalnz:
        bz,pt   %ncc, .co_exit
        wr      %l5, %g0, %gsr          ! restore %gsr
.co_unaln3x:                            ! Exactly 1, 2, or 3 bytes remain
        subcc   %i2, 1, %i2             ! reduce count for cc test
        ldub    [%i0], %o4              ! load one byte
        bz,pt   %ncc, .co_exit
        stba    %o4, [%i1]%asi          ! store one byte
        ldub    [%i0+1], %o4            ! load second byte
        subcc   %i2, 1, %i2
        bz,pt   %ncc, .co_exit
        stba    %o4, [%i1+1]%asi        ! store second byte
        ldub    [%i0+2], %o4            ! load third byte
        stba    %o4, [%i1+2]%asi        ! store third byte
.co_exit:
        brnz    %g1, .co_fp_restore
        nop
        FZERO
        wr      %g1, %g0, %fprs
        ba,pt   %ncc, .co_ex2
        membar  #Sync
.co_fp_restore:
        BLD_FP_FROMSTACK(%o4)
.co_ex2:
        andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
        ret
        restore %g0, 0, %o0

.copyout_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4
        brz     %o4, 2f
        nop
        ldn     [%o4 + CP_COPYOUT], %g2
        jmp     %g2
        nop
2:
        retl
        mov     -1, %o0

#else   /* NIAGARA_IMPL */
.do_copyout:
        !
        ! Check the length and bail if zero.
        !
        tst     %o2
        bnz,pt  %ncc, 1f
        nop
        retl
        clr     %o0
1:
        sethi   %hi(copyio_fault), %o4
        or      %o4, %lo(copyio_fault), %o4
        sethi   %hi(copyio_fault_nowindow), %o3
        ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
        or      %o3, %lo(copyio_fault_nowindow), %o3
        membar  #Sync
        stn     %o3, [THREAD_REG + T_LOFAULT]

        mov     %o0, SAVE_SRC
        mov     %o1, SAVE_DST
        mov     %o2, SAVE_COUNT

        !
        ! Check to see if we're more than SMALL_LIMIT (7 bytes).
        ! Run in leaf mode, using the %o regs as our input regs.
        !
        subcc   %o2, SMALL_LIMIT, %o3
        bgu,a,pt %ncc, .dco_ns
        or      %o0, %o1, %o3
        !
        ! What was previously ".small_copyout"
        ! Do full differenced copy.
        !
.dcobcp:
        sub     %g0, %o2, %o3           ! negate count
        add     %o0, %o2, %o0           ! make %o0 point at the end
        add     %o1, %o2, %o1           ! make %o1 point at the end
        ba,pt   %ncc, .dcocl
        ldub    [%o0 + %o3], %o4        ! load first byte
        !
        ! %o0 and %o1 point at the end and remain pointing at the end
        ! of their buffers. We pull things out by adding %o3 (which is
        ! the negation of the length) to the buffer end which gives us
        ! the current location in the buffers. By incrementing %o3 we walk
        ! through both buffers without having to bump each buffer's
        ! pointer. A very fast 4 instruction loop.
        !
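        ! Example: len = 4 gives %o3 = -4, so bytes are fetched at
        ! [end-4] through [end-1]; the bl falls through once %o3
        ! increments to 0.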
        .align 16
.dcocl:
        stba    %o4, [%o1 + %o3]ASI_USER
        inccc   %o3
        bl,a,pt %ncc, .dcocl
        ldub    [%o0 + %o3], %o4
        !
        ! We're done. Go home.
        !
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
        retl
        clr     %o0
        !
        ! Try aligned copies from here.
        !
.dco_ns:
        ! %o0 = kernel addr (to be copied from)
        ! %o1 = user addr (to be copied to)
        ! %o2 = length
        ! %o3 = %o1 | %o2 (used for alignment checking)
        ! %o4 is alternate lo_fault
        ! %o5 is original lo_fault
        !
        ! See if we're single byte aligned. If we are, check the
        ! limit for single byte copies. If we're smaller or equal,
        ! bounce to the byte for byte copy loop. Otherwise do it in
        ! HW (if enabled).
        !
        btst    1, %o3
        bz,pt   %icc, .dcoh8
        btst    7, %o3
        !
        ! Single byte aligned. Do we do it via HW or via
        ! byte for byte? Do a quick no memory reference
        ! check to pick up small copies.
        !
        sethi   %hi(hw_copy_limit_1), %o3
        !
        ! Big enough that we need to check the HW limit for
        ! this size copy.
        !
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        !
        ! Is HW copy on? If not, do everything byte for byte.
        !
        tst     %o3
        bz,pn   %icc, .dcobcp
        subcc   %o3, %o2, %o3
        !
        ! If we're less than or equal to the single byte copy limit,
        ! bop to the copy loop.
        !
        bge,pt  %ncc, .dcobcp
        nop
        !
        ! We're big enough and copy is on. Do it with HW.
        !
        ba,pt   %ncc, .big_copyout
        nop
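        ! The same dispatch pattern repeats for each alignment class:
        ! a zero hw_copy_limit_* means HW copy is disabled, and
        ! "subcc limit, len" followed by bge sends copies at or under
        ! the limit to the simple loops, larger ones to .big_copyout.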
.dcoh8:
        !
        ! 8 byte aligned?
        !
        bnz,a   %ncc, .dcoh4
        btst    3, %o3
        !
        ! See if we're in the "small range".
        ! If so, go off and do the copy.
        ! If not, load the hard limit. %o3 is
        ! available for reuse.
        !
        sethi   %hi(hw_copy_limit_8), %o3
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        !
        ! If it's zero, there's no HW bcopy.
        ! Bop off to the aligned copy.
        !
        tst     %o3
        bz,pn   %icc, .dcos8
        subcc   %o3, %o2, %o3
        !
        ! We're negative if our size is larger than hw_copy_limit_8.
        !
        bge,pt  %ncc, .dcos8
        nop
        !
        ! HW assist is on and we're large enough. Do it.
        !
        ba,pt   %ncc, .big_copyout
        nop
.dcos8:
        !
        ! Housekeeping for copy loops. Uses same idea as in the byte for
        ! byte copy loop above.
        !
        add     %o0, %o2, %o0
        add     %o1, %o2, %o1
        sub     %g0, %o2, %o3
        ba,pt   %ncc, .dodebc
        srl     %o2, 3, %o2             ! Number of 8 byte chunks to copy
        !
        ! 4 byte aligned?
        !
.dcoh4:
        bnz,pn  %ncc, .dcoh2
        !
        ! See if we're in the "small range".
        ! If so, go off and do the copy.
        ! If not, load the hard limit. %o3 is
        ! available for reuse.
        !
        sethi   %hi(hw_copy_limit_4), %o3
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        !
        ! If it's zero, there's no HW bcopy.
        ! Bop off to the aligned copy.
        !
        tst     %o3
        bz,pn   %icc, .dcos4
        subcc   %o3, %o2, %o3
        !
        ! We're negative if our size is larger than hw_copy_limit_4.
        !
        bge,pt  %ncc, .dcos4
        nop
        !
        ! HW assist is on and we're large enough. Do it.
        !
        ba,pt   %ncc, .big_copyout
        nop
.dcos4:
        add     %o0, %o2, %o0
        add     %o1, %o2, %o1
        sub     %g0, %o2, %o3
        ba,pt   %ncc, .dodfbc
        srl     %o2, 2, %o2             ! Number of 4 byte chunks to copy
        !
        ! We must be 2 byte aligned. Off we go.
        ! The check for small copies was done in the
        ! delay at .dcoh4
        !
.dcoh2:
        ble     %ncc, .dcos2
        sethi   %hi(hw_copy_limit_2), %o3
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .dcos2
        subcc   %o3, %o2, %o3
        bge,pt  %ncc, .dcos2
        nop
        !
        ! HW is on and we're big enough. Do it.
        !
        ba,pt   %ncc, .big_copyout
        nop
.dcos2:
        add     %o0, %o2, %o0
        add     %o1, %o2, %o1
        sub     %g0, %o2, %o3
        ba,pt   %ncc, .dodtbc
        srl     %o2, 1, %o2             ! Number of 2 byte chunks to copy
.small_copyout:
        !
        ! Why are we doing this AGAIN? There are certain conditions in
        ! big_copyout that will cause us to forego the HW assisted copies
        ! and bounce back to a non-HW assisted copy. This dispatches those
        ! copies. Note that we branch around this in the main line code.
        !
        ! We make no check for limits or HW enablement here. We've
        ! already been told that we're a poster child so just go off
        ! and do it.
        !
        or      %o0, %o1, %o3
        btst    1, %o3
        bnz     %icc, .dcobcp           ! Most likely
        btst    7, %o3
        bz      %icc, .dcos8
        btst    3, %o3
        bz      %icc, .dcos4
        nop
        ba,pt   %ncc, .dcos2
        nop
        .align 32
.dodebc:
        ldx     [%o0 + %o3], %o4
        deccc   %o2
        stxa    %o4, [%o1 + %o3]ASI_USER
        bg,pt   %ncc, .dodebc
        addcc   %o3, 8, %o3
        !
        ! End of copy loop. Check to see if we're done. Most
        ! eight byte aligned copies end here.
        !
        bz,pt   %ncc, .dcofh
        nop
        !
        ! Something is left - do it byte for byte.
        !
        ba,pt   %ncc, .dcocl
        ldub    [%o0 + %o3], %o4        ! load next byte
        !
        ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
        !
        .align 32
.dodfbc:
        lduw    [%o0 + %o3], %o4
        deccc   %o2
        sta     %o4, [%o1 + %o3]ASI_USER
        bg,pt   %ncc, .dodfbc
        addcc   %o3, 4, %o3
        !
        ! End of copy loop. Check to see if we're done. Most
        ! four byte aligned copies end here.
        !
        bz,pt   %ncc, .dcofh
        nop
        !
        ! Something is left. Do it byte for byte.
        !
        ba,pt   %ncc, .dcocl
        ldub    [%o0 + %o3], %o4        ! load next byte
        !
        ! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
        ! copy.
        !
        .align 32
.dodtbc:
        lduh    [%o0 + %o3], %o4
        deccc   %o2
        stha    %o4, [%o1 + %o3]ASI_USER
        bg,pt   %ncc, .dodtbc
        addcc   %o3, 2, %o3
        !
        ! End of copy loop. Anything left?
        !
        bz,pt   %ncc, .dcofh
        nop
        !
        ! Deal with the last byte
        !
        ldub    [%o0 + %o3], %o4
        stba    %o4, [%o1 + %o3]ASI_USER
.dcofh:
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
        retl
        clr     %o0

.big_copyout:
        ! We're going to go off and do a block copy.
        ! Switch fault handlers and grab a window. We
        ! don't do a membar #Sync since we've done only
        ! kernel data to this point.
        stn     %o4, [THREAD_REG + T_LOFAULT]

        ! Copyouts that reach here are larger than 256 bytes;
        ! hw_copy_limit_1 is set to 256. Never set this limit to
        ! less than 128 bytes.
        save    %sp, -SA(MINFRAME), %sp
.do_block_copyout:

        ! Swap src/dst since the code below is memcpy code
        ! and memcpy/bcopy have different calling sequences
        mov     %i1, %i5
        mov     %i0, %i1
        mov     %i5, %i0

        ! Block (64 bytes) align the destination.
        andcc   %i0, 0x3f, %i3          ! is dst block aligned
        bz      %ncc, copyout_blalign   ! dst already block aligned
        sub     %i3, 0x40, %i3
        neg     %i3                     ! bytes till dst 64 bytes aligned
        sub     %i2, %i3, %i2           ! update i2 with new count

        ! Based on source and destination alignment do
        ! either 8 bytes, 4 bytes, 2 bytes or byte copy.

        ! Is dst & src 8B aligned
        or      %i0, %i1, %o2
        andcc   %o2, 0x7, %g0
        bz      %ncc, .co_alewdcp
        nop

        ! Is dst & src 4B aligned
        andcc   %o2, 0x3, %g0
        bz      %ncc, .co_alwdcp
        nop

        ! Is dst & src 2B aligned
        andcc   %o2, 0x1, %g0
        bz      %ncc, .co_alhlfwdcp
        nop

        ! 1B aligned
1:      ldub    [%i1], %o2
        stba    %o2, [%i0]ASI_USER
        inc     %i1
        deccc   %i3
        bgu,pt  %ncc, 1b
        inc     %i0

        ba      copyout_blalign
        nop

        ! dst & src 4B aligned
.co_alwdcp:
        ld      [%i1], %o2
        sta     %o2, [%i0]ASI_USER
        add     %i1, 0x4, %i1
        subcc   %i3, 0x4, %i3
        bgu,pt  %ncc, .co_alwdcp
        add     %i0, 0x4, %i0

        ba      copyout_blalign
        nop

        ! dst & src 2B aligned
.co_alhlfwdcp:
        lduh    [%i1], %o2
        stuha   %o2, [%i0]ASI_USER
        add     %i1, 0x2, %i1
        subcc   %i3, 0x2, %i3
        bgu,pt  %ncc, .co_alhlfwdcp
        add     %i0, 0x2, %i0

        ba      copyout_blalign
        nop

        ! dst & src 8B aligned
.co_alewdcp:
        ldx     [%i1], %o2
        stxa    %o2, [%i0]ASI_USER
        add     %i1, 0x8, %i1
        subcc   %i3, 0x8, %i3
        bgu,pt  %ncc, .co_alewdcp
        add     %i0, 0x8, %i0

        ! Now Destination is block (64 bytes) aligned
copyout_blalign:
        andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
        sub     %i2, %i3, %i2           ! Residue bytes in %i2

        mov     ASI_BLK_INIT_QUAD_LDD_AIUS, %asi

        andcc   %i1, 0xf, %o2           ! is src quadword aligned
        bz,pn   %xcc, .co_blkcpy        ! src offset in %o2 (last 4-bits)
        nop
        cmp     %o2, 0x8
        bg      .co_upper_double
        nop
        bl      .co_lower_double
        nop
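        ! The src offset within the 16-byte quad selects the variant:
        ! 0 went to .co_blkcpy above, 1-7 to .co_lower_double,
        ! 9-15 to .co_upper_double, and 8 falls through here.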

        ! Falls through when the source offset is equal to 8, i.e. the
        ! source is double-word aligned.
        ! In this case no shift/merge of data is required.

        sub     %i1, %o2, %i1           ! align the src at 16 bytes.
        andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
        prefetch [%l0+0x0], #one_read
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
.co_loop0:
        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
        prefetch [%l0+0x40], #one_read

        stxa    %l3, [%i0+0x0]%asi
        stxa    %l4, [%i0+0x8]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2

        stxa    %l5, [%i0+0x10]%asi
        stxa    %l2, [%i0+0x18]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4

        stxa    %l3, [%i0+0x20]%asi
        stxa    %l4, [%i0+0x28]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2

        stxa    %l5, [%i0+0x30]%asi
        stxa    %l2, [%i0+0x38]%asi

        add     %l0, 0x40, %l0
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, .co_loop0
        add     %i0, 0x40, %i0
        ba      .co_blkdone
        add     %i1, %o2, %i1           ! increment the source by src offset
                                        ! the src offset was stored in %o2

.co_lower_double:

        sub     %i1, %o2, %i1           ! align the src at 16 bytes.
        sll     %o2, 3, %o0             ! %o0 left shift
        mov     0x40, %o1
        sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
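        ! e.g. a src offset of 3 gives a left shift of 24 and a
        ! right shift of 40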
        andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
        prefetch [%l0+0x0], #one_read
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2    ! partial data in %l2;
                                                        ! %l3 has complete data
.co_loop1:
        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4    ! %l4 has partial data
                                                        ! for this read.
        ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)        ! merge %l2, %l3 and %l4
                                                        ! into %l2 and %l3
        prefetch [%l0+0x40], #one_read

        stxa    %l2, [%i0+0x0]%asi
        stxa    %l3, [%i0+0x8]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
        ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)        ! merge %l2 with %l5 and
                                                        ! %l4 from previous read
                                                        ! into %l4 and %l5
        stxa    %l4, [%i0+0x10]%asi
        stxa    %l5, [%i0+0x18]%asi

        ! Repeat the same for next 32 bytes.

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
        ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)

        stxa    %l2, [%i0+0x20]%asi
        stxa    %l3, [%i0+0x28]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
        ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)

        stxa    %l4, [%i0+0x30]%asi
        stxa    %l5, [%i0+0x38]%asi

        add     %l0, 0x40, %l0
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, .co_loop1
        add     %i0, 0x40, %i0
        ba      .co_blkdone
        add     %i1, %o2, %i1           ! increment the source by src offset
                                        ! the src offset was stored in %o2

.co_upper_double:

        sub     %i1, %o2, %i1           ! align the src at 16 bytes.
        sub     %o2, 0x8, %o0
        sll     %o0, 3, %o0             ! %o0 left shift
        mov     0x40, %o1
        sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
        andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
        prefetch [%l0+0x0], #one_read
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2    ! partial data in %l3
                                                        ! for this read and
                                                        ! no data in %l2
.co_loop2:
        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4    ! %l4 has complete data
                                                        ! and %l5 has partial
        ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)        ! merge %l3, %l4 and %l5
                                                        ! into %l3 and %l4
        prefetch [%l0+0x40], #one_read

        stxa    %l3, [%i0+0x0]%asi
        stxa    %l4, [%i0+0x8]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
        ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)        ! merge %l2 and %l3 with
                                                        ! %l5 from previous read
                                                        ! into %l5 and %l2

        stxa    %l5, [%i0+0x10]%asi
        stxa    %l2, [%i0+0x18]%asi

        ! Repeat the same for next 32 bytes.

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
        ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)

        stxa    %l3, [%i0+0x20]%asi
        stxa    %l4, [%i0+0x28]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
        ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)

        stxa    %l5, [%i0+0x30]%asi
        stxa    %l2, [%i0+0x38]%asi

        add     %l0, 0x40, %l0
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, .co_loop2
        add     %i0, 0x40, %i0
        ba      .co_blkdone
        add     %i1, %o2, %i1           ! increment the source by src offset
                                        ! the src offset was stored in %o2


        ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
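        ! Each iteration below moves one 64-byte block: four 16-byte
        ! quad loads fill %l0-%l7, then eight stxa stores drain them
        ! through the block-init ASI in %asi, which is intended to
        ! avoid first fetching the destination lines into the cache.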
.co_blkcpy:

        andn    %i1, 0x3f, %o0          ! %o0 has block aligned source
        prefetch [%o0+0x0], #one_read
1:
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
        add     %i1, 0x10, %i1

        prefetch [%o0+0x40], #one_read

        stxa    %l0, [%i0+0x0]%asi

        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
        add     %i1, 0x10, %i1

        stxa    %l1, [%i0+0x8]%asi
        stxa    %l2, [%i0+0x10]%asi
        stxa    %l3, [%i0+0x18]%asi
        stxa    %l4, [%i0+0x20]%asi
        stxa    %l5, [%i0+0x28]%asi
        stxa    %l6, [%i0+0x30]%asi
        stxa    %l7, [%i0+0x38]%asi

        add     %o0, 0x40, %o0
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, 1b
        add     %i0, 0x40, %i0

.co_blkdone:
        membar  #Sync

        brz,pt  %i2, .copyout_exit
        nop

        ! Handle trailing bytes
        cmp     %i2, 0x8
        blu,pt  %ncc, .co_residue
        nop

        ! Can we do some 8B ops
        or      %i1, %i0, %o2
        andcc   %o2, 0x7, %g0
        bnz     %ncc, .co_last4
        nop

        ! Do 8byte ops as long as possible
.co_last8:
        ldx     [%i1], %o2
        stxa    %o2, [%i0]ASI_USER
        add     %i1, 0x8, %i1
        sub     %i2, 0x8, %i2
        cmp     %i2, 0x8
        bgu,pt  %ncc, .co_last8
        add     %i0, 0x8, %i0

        brz,pt  %i2, .copyout_exit
        nop

        ba      .co_residue
        nop

.co_last4:
        ! Can we do 4B ops
        andcc   %o2, 0x3, %g0
        bnz     %ncc, .co_last2
        nop
1:
        ld      [%i1], %o2
        sta     %o2, [%i0]ASI_USER
        add     %i1, 0x4, %i1
        sub     %i2, 0x4, %i2
        cmp     %i2, 0x4
        bgu,pt  %ncc, 1b
        add     %i0, 0x4, %i0

        brz,pt  %i2, .copyout_exit
        nop

        ba      .co_residue
        nop

.co_last2:
        ! Can we do 2B ops
        andcc   %o2, 0x1, %g0
        bnz     %ncc, .co_residue
        nop

1:
        lduh    [%i1], %o2
        stuha   %o2, [%i0]ASI_USER
        add     %i1, 0x2, %i1
        sub     %i2, 0x2, %i2
        cmp     %i2, 0x2
        bgu,pt  %ncc, 1b
        add     %i0, 0x2, %i0

        brz,pt  %i2, .copyout_exit
        nop

        ! Copy the residue as byte copy
.co_residue:
        ldub    [%i1], %i4
        stba    %i4, [%i0]ASI_USER
        inc     %i1
        deccc   %i2
        bgu,pt  %xcc, .co_residue
        inc     %i0

.copyout_exit:
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
        ret
        restore %g0, 0, %o0

.copyout_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4
        brz     %o4, 2f
        nop
        ldn     [%o4 + CP_COPYOUT], %g2
        jmp     %g2
        nop
2:
        retl
        mov     -1, %o0
#endif  /* NIAGARA_IMPL */
        SET_SIZE(copyout)


        ENTRY(xcopyout)
        sethi   %hi(.xcopyout_err), REAL_LOFAULT
        b       .do_copyout
        or      REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
.xcopyout_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4
        brz     %o4, 2f
        nop
        ldn     [%o4 + CP_XCOPYOUT], %g2
        jmp     %g2
        nop
2:
        retl
        mov     %g1, %o0
        SET_SIZE(xcopyout)

        ENTRY(xcopyout_little)
        sethi   %hi(.little_err), %o4
        ldn     [THREAD_REG + T_LOFAULT], %o5
        or      %o4, %lo(.little_err), %o4
        membar  #Sync                   ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]

        subcc   %g0, %o2, %o3
        add     %o0, %o2, %o0
        bz,pn   %ncc, 2f                ! check for zero bytes
        sub     %o2, 1, %o4
        add     %o0, %o4, %o0           ! start w/last byte
        add     %o1, %o2, %o1
        ldub    [%o0+%o3], %o4

1:      stba    %o4, [%o1+%o3]ASI_AIUSL
        inccc   %o3
        sub     %o0, 2, %o0             ! get next byte
        bcc,a,pt %ncc, 1b
        ldub    [%o0+%o3], %o4

2:      membar  #Sync                   ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
        mov     %g0, %o0                ! return (0)
        SET_SIZE(xcopyout_little)

/*
 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
 */

        ENTRY(copyin)
        sethi   %hi(.copyin_err), REAL_LOFAULT
        or      REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT

#if !defined(NIAGARA_IMPL)
.do_copyin:
        tst     %o2                     ! check for zero count;  quick exit
        bz,pt   %ncc, .ci_smallqx
        mov     %o0, SAVE_SRC
        mov     %o1, SAVE_DST
        mov     %o2, SAVE_COUNT
        cmp     %o2, FP_COPY            ! check for small copy/leaf case
        bgt,pt  %ncc, .ci_copy_more
        ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
/*
 * Small copy in code
 *
 */
        sethi   %hi(copyio_fault_nowindow), %o3
        or      %o3, %lo(copyio_fault_nowindow), %o3
        membar  #Sync
        stn     %o3, [THREAD_REG + T_LOFAULT]

        mov     ASI_USER, %asi
        cmp     %o2, SHORTCOPY          ! make sure there is enough to align
        ble,pt  %ncc, .ci_smallest
        andcc   %o1, 0x7, %o3           ! is dest long word aligned
        bnz,pn  %ncc, .ci_align
        andcc   %o1, 1, %o3             ! is dest byte aligned

! Destination is long word aligned
.ci_al_src:
        andcc   %o0, 7, %o3
        brnz,pt %o3, .ci_src_dst_unal8
        nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than FP_COPY bytes
 * Also handles finish-up for large block moves, so the count may be
 * less than 32 bytes
 */
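/*
 * Loop structure below, as an illustrative C-like sketch (the
 * off-by-31/off-by-7 bias lets the bgu/ble tests double as the
 * residue computation):
 *
 *	count -= 31;
 *	while (count > 0) { copy 32 bytes; count -= 32; }
 *	count += 24;
 *	while (count > 0) { copy 8 bytes; count -= 8; }
 *	count += 7;	now 0 <= count <= 7
 */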
.ci_medlong:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .ci_medl31
        nop
.ci_medl32:
        ldxa    [%o0]%asi, %o4          ! move 32 bytes
        subcc   %o2, 32, %o2            ! decrement length count by 32
        stx     %o4, [%o1]
        ldxa    [%o0+8]%asi, %o4
        stx     %o4, [%o1+8]
        ldxa    [%o0+16]%asi, %o4
        add     %o0, 32, %o0            ! increase src ptr by 32
        stx     %o4, [%o1+16]
        ldxa    [%o0-8]%asi, %o4
        add     %o1, 32, %o1            ! increase dst ptr by 32
        bgu,pt  %ncc, .ci_medl32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]
.ci_medl31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .ci_medl7         ! skip if 7 or fewer bytes left
        nop
.ci_medl8:
        ldxa    [%o0]%asi, %o4          ! move 8 bytes
        add     %o0, 8, %o0             ! increase src ptr by 8
        subcc   %o2, 8, %o2             ! decrease count by 8
        add     %o1, 8, %o1             ! increase dst ptr by 8
        bgu,pt  %ncc, .ci_medl8
        stx     %o4, [%o1-8]
.ci_medl7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bnz,pt  %ncc, .ci_small4        ! do final bytes if not finished
        nop
.ci_smallx:                             ! finish up and exit
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
.ci_smallqx:
        retl
        mov     %g0, %o0

.ci_small4:
        cmp     %o2, 4
        blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
        nop                             !
        lda     [%o0]%asi, %o4          ! move 4 bytes
        add     %o0, 4, %o0             ! increase src ptr by 4
        add     %o1, 4, %o1             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bz      %ncc, .ci_smallx
        stw     %o4, [%o1-4]

.ci_small3x:                            ! Exactly 1, 2, or 3 bytes remain
        subcc   %o2, 1, %o2             ! reduce count for cc test
        lduba   [%o0]%asi, %o4          ! load one byte
        bz,pt   %ncc, .ci_smallx
        stb     %o4, [%o1]              ! store one byte
        lduba   [%o0+1]%asi, %o4        ! load second byte
        subcc   %o2, 1, %o2
        bz,pt   %ncc, .ci_smallx
        stb     %o4, [%o1+1]            ! store second byte
        lduba   [%o0+2]%asi, %o4        ! load third byte
        ba      .ci_smallx
        stb     %o4, [%o1+2]            ! store third byte

.ci_smallest:                           ! 7 or fewer bytes remain
        cmp     %o2, 4
        blt,pt  %ncc, .ci_small3x
        nop
        lduba   [%o0]%asi, %o4          ! read byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        stb     %o4, [%o1]              ! write byte
        lduba   [%o0+1]%asi, %o4        ! repeat for total of 4 bytes
        add     %o0, 4, %o0             ! advance src by 4
        stb     %o4, [%o1+1]
        lduba   [%o0-2]%asi, %o4
        add     %o1, 4, %o1             ! advance dst by 4
        stb     %o4, [%o1-2]
        lduba   [%o0-1]%asi, %o4
        bnz,pt  %ncc, .ci_small3x
        stb     %o4, [%o1-1]
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
        retl
        mov     %g0, %o0

.ci_align:
        bnz,pt  %ncc, .ci_al_d1
.ci_al_d1f:                             ! dest is now half word aligned
        andcc   %o1, 2, %o3             ! is dest word aligned
        bnz,pt  %ncc, .ci_al_d2
.ci_al_d2f:                             ! dest is now word aligned
        andcc   %o1, 4, %o3             ! is dest longword aligned?
        bz,pt   %ncc, .ci_al_src
        nop
.ci_al_d4:                              ! dest is word aligned;  src is unknown
        lduba   [%o0]%asi, %o4          ! move a word (src align unknown)
        lduba   [%o0+1]%asi, %o3
        sll     %o4, 24, %o4            ! position
        sll     %o3, 16, %o3            ! position
        or      %o4, %o3, %o3           ! merge
        lduba   [%o0+2]%asi, %o4
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o3           ! merge
        lduba   [%o0+3]%asi, %o4
        or      %o4, %o3, %o4           ! merge
        stw     %o4,[%o1]               ! store four bytes
        add     %o0, 4, %o0             ! adjust src by 4
        add     %o1, 4, %o1             ! adjust dest by 4
        sub     %o2, 4, %o2             ! adjust count by 4
        andcc   %o0, 7, %o3             ! check for src long word alignment
        brz,pt  %o3, .ci_medlong
.ci_src_dst_unal8:
        ! dst is 8-byte aligned, src is not
        ! Size is less than FP_COPY
        ! Following code is to select for alignment
        andcc   %o0, 0x3, %o3           ! test word alignment
        bz,pt   %ncc, .ci_medword
        nop
        andcc   %o0, 0x1, %o3           ! test halfword alignment
        bnz,pt  %ncc, .ci_med_byte      ! go to byte move if not halfword
        andcc   %o0, 0x2, %o3           ! test which byte alignment
        ba      .ci_medhalf
        nop
.ci_al_d1:                              ! align dest to half word
        lduba   [%o0]%asi, %o4          ! move a byte
        add     %o0, 1, %o0
        stb     %o4, [%o1]
        add     %o1, 1, %o1
        andcc   %o1, 2, %o3             ! is dest word aligned
        bz,pt   %ncc, .ci_al_d2f
        sub     %o2, 1, %o2
.ci_al_d2:                              ! align dest to word
        lduba   [%o0]%asi, %o4          ! move a half-word (src align unknown)
        lduba   [%o0+1]%asi, %o3
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o4           ! merge
        sth     %o4, [%o1]
        add     %o0, 2, %o0
        add     %o1, 2, %o1
        andcc   %o1, 4, %o3             ! is dest longword aligned?
        bz,pt   %ncc, .ci_al_src
        sub     %o2, 2, %o2
        ba      .ci_al_d4
        nop
/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * the source and destination are in cache, for medium
 * to short data moves.
 */
.ci_medword:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .ci_medw31
        nop
.ci_medw32:
        lda     [%o0]%asi, %o4          ! move a block of 32 bytes
        stw     %o4, [%o1]
        lda     [%o0+4]%asi, %o4
        stw     %o4, [%o1+4]
        lda     [%o0+8]%asi, %o4
        stw     %o4, [%o1+8]
        lda     [%o0+12]%asi, %o4
        stw     %o4, [%o1+12]
        lda     [%o0+16]%asi, %o4
        stw     %o4, [%o1+16]
        lda     [%o0+20]%asi, %o4
        subcc   %o2, 32, %o2            ! decrement length count
        stw     %o4, [%o1+20]
        lda     [%o0+24]%asi, %o4
        add     %o0, 32, %o0            ! increase src ptr by 32
        stw     %o4, [%o1+24]
        lda     [%o0-4]%asi, %o4
        add     %o1, 32, %o1            ! increase dst ptr by 32
        bgu,pt  %ncc, .ci_medw32        ! repeat if at least 32 bytes left
        stw     %o4, [%o1-4]
.ci_medw31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .ci_medw7         ! skip if 7 or fewer bytes left
        nop                             !
.ci_medw15:
        lda     [%o0]%asi, %o4          ! move a block of 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        stw     %o4, [%o1]
        add     %o0, 8, %o0             ! increase src ptr by 8
        lda     [%o0-4]%asi, %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
        bgu,pt  %ncc, .ci_medw15
        stw     %o4, [%o1-4]
.ci_medw7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .ci_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
        nop                             !
        lda     [%o0]%asi, %o4          ! move 4 bytes
        add     %o0, 4, %o0             ! increase src ptr by 4
        add     %o1, 4, %o1             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bnz     .ci_small3x
        stw     %o4, [%o1-4]
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
        retl
        mov     %g0, %o0
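
        ! .ci_medhalf below builds each aligned 8-byte store from
        ! half-word-aligned source as, roughly (illustrative):
        !       dst64 = (u64)h0 << 48 | (u64)w << 16 | h1;
        ! where h0/h1 are 2-byte loads and w is a 4-byte load.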

.ci_medhalf:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .ci_medh31
        nop
.ci_medh32:                             ! load and store block of 32 bytes
        subcc   %o2, 32, %o2            ! decrement length count

        lduha   [%o0]%asi, %o4          ! move 32 bytes
        lduwa   [%o0+2]%asi, %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduha   [%o0+6]%asi, %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        lduha   [%o0+8]%asi, %o4
        lduwa   [%o0+10]%asi, %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduha   [%o0+14]%asi, %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        lduha   [%o0+16]%asi, %o4
        lduwa   [%o0+18]%asi, %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduha   [%o0+22]%asi, %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        lduha   [%o0-8]%asi, %o4
        lduwa   [%o0-6]%asi, %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduha   [%o0-2]%asi, %o4
        or      %o3, %o4, %o4
        bgu,pt  %ncc, .ci_medh32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.ci_medh31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .ci_medh7         ! skip if 7 or fewer bytes left
        nop                             !
.ci_medh15:
        lduha   [%o0]%asi, %o4          ! move 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        lduwa   [%o0+2]%asi, %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        add     %o1, 8, %o1             ! increase dst ptr by 8
        lduha   [%o0+6]%asi, %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .ci_medh15
        stx     %o4, [%o1-8]
.ci_medh7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .ci_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
        nop                             !
        lduha   [%o0]%asi, %o4
        sll     %o4, 16, %o4
        lduha   [%o0+2]%asi, %o3
        or      %o3, %o4, %o4
        subcc   %o2, 4, %o2
        add     %o0, 4, %o0
        add     %o1, 4, %o1
        bnz     .ci_small3x
        stw     %o4, [%o1-4]
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
        retl
        mov     %g0, %o0
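
        ! .ci_med_byte below handles the odd-byte source alignments.
        ! Each aligned 8-byte store is assembled as, roughly
        ! (illustrative):
        !       alignment 1 or 5: dst64 = b0<<56 | h<<40 | w<<8 | b1;
        !       alignment 3 or 7: dst64 = b0<<56 | w<<24 | h<<8 | b1;
        ! from byte (b), half-word (h) and word (w) loads.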

        .align 16
.ci_med_byte:
        bnz,pt  %ncc, .ci_medbh32a      ! go to correct byte move
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .ci_medb31
        nop
.ci_medb32:                             ! Alignment 1 or 5
        subcc   %o2, 32, %o2            ! decrement length count

        lduba   [%o0]%asi, %o4          ! load and store a block of 32 bytes
        sllx    %o4, 56, %o3
        lduha   [%o0+1]%asi, %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduwa   [%o0+3]%asi, %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%o0+7]%asi, %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        lduba   [%o0+8]%asi, %o4
        sllx    %o4, 56, %o3
        lduha   [%o0+9]%asi, %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduwa   [%o0+11]%asi, %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%o0+15]%asi, %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        lduba   [%o0+16]%asi, %o4
        sllx    %o4, 56, %o3
        lduha   [%o0+17]%asi, %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduwa   [%o0+19]%asi, %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%o0+23]%asi, %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        lduba   [%o0-8]%asi, %o4
        sllx    %o4, 56, %o3
        lduha   [%o0-7]%asi, %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduwa   [%o0-5]%asi, %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%o0-1]%asi, %o4
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .ci_medb32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.ci_medb31:                             ! 31 or fewer bytes remaining
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .ci_medb7         ! skip if 7 or fewer bytes left
        nop                             !
.ci_medb15:

        lduba   [%o0]%asi, %o4          ! load and store a block of 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        sllx    %o4, 56, %o3
        lduha   [%o0+1]%asi, %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduwa   [%o0+3]%asi, %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%o0+7]%asi, %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .ci_medb15
        stx     %o4, [%o1-8]
.ci_medb7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .ci_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
        nop                             !
        lduba   [%o0]%asi, %o4          ! move 4 bytes
        sll     %o4, 24, %o3
        lduha   [%o0+1]%asi, %o4
        sll     %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%o0+3]%asi, %o4
        or      %o4, %o3, %o4
        subcc   %o2, 4, %o2
        add     %o0, 4, %o0
        add     %o1, 4, %o1
        bnz     .ci_small3x
        stw     %o4, [%o1-4]
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
        retl
        mov     %g0, %o0

        .align 16
.ci_medbh32a:                           ! Alignment 3 or 7
        ble,pt  %ncc, .ci_medbh31
        nop
.ci_medbh32:                            ! Alignment 3 or 7
        subcc   %o2, 32, %o2            ! decrement length count

        lduba   [%o0]%asi, %o4          ! load and store a block of 32 bytes
        sllx    %o4, 56, %o3
        lduwa   [%o0+1]%asi, %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduha   [%o0+5]%asi, %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%o0+7]%asi, %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        lduba   [%o0+8]%asi, %o4
        sllx    %o4, 56, %o3
        lduwa   [%o0+9]%asi, %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduha   [%o0+13]%asi, %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%o0+15]%asi, %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        lduba   [%o0+16]%asi, %o4
        sllx    %o4, 56, %o3
        lduwa   [%o0+17]%asi, %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduha   [%o0+21]%asi, %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%o0+23]%asi, %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        lduba   [%o0-8]%asi, %o4
        sllx    %o4, 56, %o3
        lduwa   [%o0-7]%asi, %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduha   [%o0-3]%asi, %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%o0-1]%asi, %o4
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .ci_medbh32       ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.ci_medbh31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .ci_medb7         ! skip if 7 or fewer bytes left
        nop                             !
.ci_medbh15:
        lduba   [%o0]%asi, %o4          ! load and store a block of 8 bytes
        sllx    %o4, 56, %o3
        lduwa   [%o0+1]%asi, %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduha   [%o0+5]%asi, %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%o0+7]%asi, %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]
        subcc   %o2, 8, %o2             ! decrement length count
        add     %o1, 8, %o1             ! increase dst ptr by 8
        add     %o0, 8, %o0             ! increase src ptr by 8
        bgu,pt  %ncc, .ci_medbh15
        stx     %o4, [%o1-8]
        ba      .ci_medb7
        nop

/*
 * End of small copy in code (no window)
 *
 */

/*
 * Long copy in code (using register window and fp regs)
 *
 */

.ci_copy_more:
        sethi   %hi(copyio_fault), %o3
        or      %o3, %lo(copyio_fault), %o3
        membar  #Sync
        stn     %o3, [THREAD_REG + T_LOFAULT]
/*
 * The following code is for large copies. We know there are at
 * least FP_COPY bytes available. FP regs are used, so
 * we save the registers and fp regs before starting.
 */
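/*
 * Overall flow from here, as an illustrative sketch:
 *	align dst to an 8-byte boundary;
 *	if (src is also 8-byte aligned)
 *		align dst to 64 bytes, then .ci_aln_*
 *		(block loads plus fmovd rotation);
 *	else
 *		.ci_big_unal8: align dst to 64 bytes, then the
 *		faligndata byte-shifting loops;
 *	finish the residue with 8/4/1-byte copies;
 */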
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        or      SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
        rd      %fprs, %g1              ! check for unused fp
        ! if fprs.fef == 0, set it.
        ! Setting it when already set costs more than checking
        andcc   %g1, FPRS_FEF, %g1      ! test FEF, fprs.du = fprs.dl = 0
        bz,pt   %ncc, .ci_fp_unused
        mov     ASI_USER, %asi
        BST_FP_TOSTACK(%o3)
        ba      .ci_fp_ready
.ci_fp_unused:
        prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read
        wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
.ci_fp_ready:
        rd      %gsr, %l5               ! save %gsr value
        andcc   %i1, 1, %o3             ! is dest byte aligned
        bnz,pt  %ncc, .ci_big_d1
.ci_big_d1f:                            ! dest is now half word aligned
        andcc   %i1, 2, %o3
        bnz,pt  %ncc, .ci_big_d2
.ci_big_d2f:                            ! dest is now word aligned
        andcc   %i1, 4, %o3
        bnz,pt  %ncc, .ci_big_d4
.ci_big_d4f:                            ! dest is long word aligned
        andcc   %i0, 7, %o3             ! is src long word aligned
        brnz,pt %o3, .ci_big_unal8
        prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read
        ! Src and dst are long word aligned
        ! align dst to 64 byte boundary
        andcc   %i1, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
        brz,pn  %o3, .ci_al_to_64
        nop
        sub     %o3, 64, %o3            ! %o3 has negative bytes to move
        add     %i2, %o3, %i2           ! adjust remaining count
        andcc   %o3, 8, %o4             ! odd long words to move?
        brz,pt  %o4, .ci_al_to_16
        nop
        add     %o3, 8, %o3
        ldxa    [%i0]%asi, %o4
        add     %i0, 8, %i0             ! increment src ptr
        add     %i1, 8, %i1             ! increment dst ptr
        stx     %o4, [%i1-8]
! Dest is aligned on 16 bytes, src 8 byte aligned
.ci_al_to_16:
        andcc   %o3, 0x30, %o4          ! pair of long words to move?
        brz,pt  %o4, .ci_al_to_64
        nop
.ci_al_mv_16:
        add     %o3, 16, %o3
        ldxa    [%i0]%asi, %o4
        stx     %o4, [%i1]
        add     %i0, 16, %i0            ! increment src ptr
        ldxa    [%i0-8]%asi, %o4
        stx     %o4, [%i1+8]
        andcc   %o3, 0x30, %o4
        brnz,pt %o4, .ci_al_mv_16
        add     %i1, 16, %i1            ! increment dst ptr
! Dest is aligned on 64 bytes, src 8 byte aligned
.ci_al_to_64:
        ! Determine source alignment
        ! to correct 8 byte offset
        andcc   %i0, 32, %o3
        brnz,pn %o3, .ci_aln_1
        andcc   %i0, 16, %o3
        brnz,pn %o3, .ci_aln_01
        andcc   %i0, 8, %o3
        brz,pn  %o3, .ci_aln_000
        prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
        ba      .ci_aln_001
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
.ci_aln_01:
        brnz,pn %o3, .ci_aln_011
        prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
        ba      .ci_aln_010
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
.ci_aln_1:
        andcc   %i0, 16, %o3
        brnz,pn %o3, .ci_aln_11
        andcc   %i0, 8, %o3
        brnz,pn %o3, .ci_aln_101
        prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
        ba      .ci_aln_100
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
.ci_aln_11:
        brz,pn  %o3, .ci_aln_110
        prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
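
        ! All of the .ci_aln_* loops below share one pattern
        ! (illustrative):
        !       %i1 = dst - src;        /* one pointer walks both */
        !       preload the 8..56 lead-in bytes into %d0..;
        !       loop:   64-byte block load into %d16-%d30;
        !               rotate into %d0-%d14 with fmovd;
        !               block-init store 64 bytes at %i0 + %i1;
        ! The _xyz suffix encodes src bits <5:3>, i.e. how far the
        ! source sits past a 64-byte boundary.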

.ci_aln_111:
! Alignment off by 8 bytes
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        ldda    [%i0]%asi, %d0
        add     %i0, 8, %i0
        sub     %i2, 8, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.ci_aln_111_loop:
        ldda    [%i0]ASI_BLK_AIUS,%d16          ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d2
        fmovd   %d18, %d4
        fmovd   %d20, %d6
        fmovd   %d22, %d8
        fmovd   %d24, %d10
        fmovd   %d26, %d12
        fmovd   %d28, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d30, %d0
        bgt,pt  %ncc, .ci_aln_111_loop
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        ba      .ci_remain_stuff
        add     %i1, 8, %i1
        ! END OF aln_111

.ci_aln_110:
! Alignment off by 16 bytes
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        ldda    [%i0]%asi, %d0
        ldda    [%i0+8]%asi, %d2
        add     %i0, 16, %i0
        sub     %i2, 16, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.ci_aln_110_loop:
        ldda    [%i0]ASI_BLK_AIUS,%d16          ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d4
        fmovd   %d18, %d6
        fmovd   %d20, %d8
        fmovd   %d22, %d10
        fmovd   %d24, %d12
        fmovd   %d26, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d28, %d0
        fmovd   %d30, %d2
        bgt,pt  %ncc, .ci_aln_110_loop
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        ba      .ci_remain_stuff
        add     %i1, 16, %i1
        ! END OF aln_110

.ci_aln_101:
! Alignment off by 24 bytes
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        ldda    [%i0]%asi, %d0
        ldda    [%i0+8]%asi, %d2
        ldda    [%i0+16]%asi, %d4
        add     %i0, 24, %i0
        sub     %i2, 24, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.ci_aln_101_loop:
        ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d6
        fmovd   %d18, %d8
        fmovd   %d20, %d10
        fmovd   %d22, %d12
        fmovd   %d24, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d26, %d0
        fmovd   %d28, %d2
        fmovd   %d30, %d4
        bgt,pt  %ncc, .ci_aln_101_loop
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        ba      .ci_remain_stuff
        add     %i1, 24, %i1
        ! END OF aln_101

.ci_aln_100:
! Alignment off by 32 bytes
        ldda    [%i0]%asi, %d0
        ldda    [%i0+8]%asi, %d2
        ldda    [%i0+16]%asi,%d4
        ldda    [%i0+24]%asi,%d6
        add     %i0, 32, %i0
        sub     %i2, 32, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.ci_aln_100_loop:
        ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d8
        fmovd   %d18, %d10
        fmovd   %d20, %d12
        fmovd   %d22, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d24, %d0
        fmovd   %d26, %d2
        fmovd   %d28, %d4
        fmovd   %d30, %d6
        bgt,pt  %ncc, .ci_aln_100_loop
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        ba      .ci_remain_stuff
        add     %i1, 32, %i1
        ! END OF aln_100

.ci_aln_011:
! Alignment off by 40 bytes
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        ldda    [%i0]%asi, %d0
        ldda    [%i0+8]%asi, %d2
        ldda    [%i0+16]%asi, %d4
        ldda    [%i0+24]%asi, %d6
        ldda    [%i0+32]%asi, %d8
        add     %i0, 40, %i0
        sub     %i2, 40, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.ci_aln_011_loop:
        ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d10
        fmovd   %d18, %d12
        fmovd   %d20, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d22, %d0
        fmovd   %d24, %d2
        fmovd   %d26, %d4
        fmovd   %d28, %d6
        fmovd   %d30, %d8
        bgt,pt  %ncc, .ci_aln_011_loop
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        std     %d8, [%i1+32]
        ba      .ci_remain_stuff
        add     %i1, 40, %i1
        ! END OF aln_011

.ci_aln_010:
! Alignment off by 48 bytes
        ldda    [%i0]%asi, %d0
        ldda    [%i0+8]%asi, %d2
        ldda    [%i0+16]%asi, %d4
        ldda    [%i0+24]%asi, %d6
        ldda    [%i0+32]%asi, %d8
        ldda    [%i0+40]%asi, %d10
        add     %i0, 48, %i0
        sub     %i2, 48, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.ci_aln_010_loop:
        ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d12
        fmovd   %d18, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d20, %d0
        fmovd   %d22, %d2
        fmovd   %d24, %d4
        fmovd   %d26, %d6
        fmovd   %d28, %d8
        fmovd   %d30, %d10
        bgt,pt  %ncc, .ci_aln_010_loop
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        std     %d8, [%i1+32]
        std     %d10, [%i1+40]
        ba      .ci_remain_stuff
        add     %i1, 48, %i1
        ! END OF aln_010

.ci_aln_001:
! Alignment off by 56 bytes
        ldda    [%i0]%asi, %d0
        ldda    [%i0+8]%asi, %d2
        ldda    [%i0+16]%asi, %d4
        ldda    [%i0+24]%asi, %d6
        ldda    [%i0+32]%asi, %d8
        ldda    [%i0+40]%asi, %d10
        ldda    [%i0+48]%asi, %d12
        add     %i0, 56, %i0
        sub     %i2, 56, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.ci_aln_001_loop:
        ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d18, %d0
        fmovd   %d20, %d2
        fmovd   %d22, %d4
        fmovd   %d24, %d6
        fmovd   %d26, %d8
        fmovd   %d28, %d10
        fmovd   %d30, %d12
        bgt,pt  %ncc, .ci_aln_001_loop
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        std     %d8, [%i1+32]
        std     %d10, [%i1+40]
        std     %d12, [%i1+48]
        ba      .ci_remain_stuff
        add     %i1, 56, %i1
        ! END OF aln_001

.ci_aln_000:
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.ci_aln_000_loop:
        ldda    [%i0]ASI_BLK_AIUS,%d0
        subcc   %o3, 64, %o3
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        bgt,pt  %ncc, .ci_aln_000_loop
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
        add     %i1, %i0, %i1

        ! END OF aln_000

.ci_remain_stuff:
        subcc   %i2, 31, %i2            ! adjust length to allow cc test
        ble,pt  %ncc, .ci_aln_31
        nop
.ci_aln_32:
        ldxa    [%i0]%asi, %o4          ! move 32 bytes
        subcc   %i2, 32, %i2            ! decrement length count by 32
        stx     %o4, [%i1]
        ldxa    [%i0+8]%asi, %o4
        stx     %o4, [%i1+8]
        ldxa    [%i0+16]%asi, %o4
        add     %i0, 32, %i0            ! increase src ptr by 32
        stx     %o4, [%i1+16]
        ldxa    [%i0-8]%asi, %o4
        add     %i1, 32, %i1            ! increase dst ptr by 32
        bgu,pt  %ncc, .ci_aln_32        ! repeat if at least 32 bytes left
        stx     %o4, [%i1-8]
.ci_aln_31:
        addcc   %i2, 24, %i2            ! adjust count to be off by 7
        ble,pt  %ncc, .ci_aln_7         ! skip if 7 or fewer bytes left
        nop                             !
.ci_aln_15:
        ldxa    [%i0]%asi, %o4          ! move 8 bytes
        add     %i0, 8, %i0             ! increase src ptr by 8
        subcc   %i2, 8, %i2             ! decrease count by 8
        add     %i1, 8, %i1             ! increase dst ptr by 8
        bgu,pt  %ncc, .ci_aln_15
        stx     %o4, [%i1-8]            !
.ci_aln_7:
        addcc   %i2, 7, %i2             ! finish adjustment of remaining count
        bz,pt   %ncc, .ci_exit          ! exit if finished
        cmp     %i2, 4
        blt,pt  %ncc, .ci_unaln3x       ! skip if less than 4 bytes left
        nop                             !
        lda     [%i0]%asi, %o4          ! move 4 bytes
        add     %i0, 4, %i0             ! increase src ptr by 4
        add     %i1, 4, %i1             ! increase dst ptr by 4
        subcc   %i2, 4, %i2             ! decrease count by 4
        bnz     .ci_unaln3x
        stw     %o4, [%i1-4]
        ba      .ci_exit
        nop

        ! destination alignment code
.ci_big_d1:
        lduba   [%i0]%asi, %o4          ! move a byte
        add     %i0, 1, %i0
        stb     %o4, [%i1]
        add     %i1, 1, %i1
        andcc   %i1, 2, %o3
        bz,pt   %ncc, .ci_big_d2f
        sub     %i2, 1, %i2
.ci_big_d2:                             ! dest is now at least half word aligned
        lduba   [%i0]%asi, %o4          ! move a half-word (src align unknown)
        lduba   [%i0+1]%asi, %o3
        add     %i0, 2, %i0
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o4           ! merge
        sth     %o4, [%i1]
        add     %i1, 2, %i1
        andcc   %i1, 4, %o3
        bz,pt   %ncc, .ci_big_d4f
        sub     %i2, 2, %i2
.ci_big_d4:                             ! dest is at least word aligned
        nop
        lduba   [%i0]%asi, %o4          ! move a word (src align unknown)
        lduba   [%i0+1]%asi, %o3
        sll     %o4, 24, %o4            ! position
        sll     %o3, 16, %o3            ! position
        or      %o4, %o3, %o3           ! merge
        lduba   [%i0+2]%asi, %o4
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o3           ! merge
        lduba   [%i0+3]%asi, %o4
        or      %o4, %o3, %o4           ! merge
        stw     %o4,[%i1]               ! store four bytes
        add     %i0, 4, %i0             ! adjust src by 4
        add     %i1, 4, %i1             ! adjust dest by 4
        ba      .ci_big_d4f
        sub     %i2, 4, %i2             ! adjust count by 4


        ! Dst is on an 8 byte boundary; src is not
.ci_big_unal8:
        andcc   %i1, 0x3f, %o3          ! is dst 64-byte block aligned?
        bz      %ncc, .ci_unalnsrc
        sub     %o3, 64, %o3            ! %o3 will be multiple of 8
        neg     %o3                     ! bytes until dest is 64 byte aligned
        sub     %i2, %o3, %i2           ! update cnt with bytes to be moved
        ! Move bytes according to source alignment
        andcc   %i0, 0x1, %o4
        bnz     %ncc, .ci_unalnbyte     ! check for byte alignment
        nop
        andcc   %i0, 2, %o4             ! check for half word alignment
        bnz     %ncc, .ci_unalnhalf
        nop
        ! Src is word aligned, move bytes until dest 64 byte aligned
.ci_unalnword:
        lda     [%i0]%asi, %o4          ! load 4 bytes
        stw     %o4, [%i1]              ! and store 4 bytes
        lda     [%i0+4]%asi, %o4        ! load 4 bytes
        add     %i0, 8, %i0             ! increase src ptr by 8
        stw     %o4, [%i1+4]            ! and store 4 bytes
        subcc   %o3, 8, %o3             ! decrease count by 8
        bnz     %ncc, .ci_unalnword
        add     %i1, 8, %i1             ! increase dst ptr by 8
        ba      .ci_unalnsrc
        nop

        ! Src is half-word aligned, move bytes until dest 64 byte aligned
.ci_unalnhalf:
        lduha   [%i0]%asi, %o4          ! load 2 bytes
        sllx    %o4, 32, %i3            ! shift left
        lduwa   [%i0+2]%asi, %o4
        or      %o4, %i3, %i3
        sllx    %i3, 16, %i3
        lduha   [%i0+6]%asi, %o4
        or      %o4, %i3, %i3
        stx     %i3, [%i1]
        add     %i0, 8, %i0
        subcc   %o3, 8, %o3
        bnz     %ncc, .ci_unalnhalf
        add     %i1, 8, %i1
        ba      .ci_unalnsrc
        nop

        ! Src is Byte aligned, move bytes until dest 64 byte aligned
.ci_unalnbyte:
        sub     %i1, %i0, %i1           ! share pointer advance
.ci_unalnbyte_loop:
        lduba   [%i0]%asi, %o4
        sllx    %o4, 56, %i3
        lduha   [%i0+1]%asi, %o4
        sllx    %o4, 40, %o4
        or      %o4, %i3, %i3
        lduha   [%i0+3]%asi, %o4
        sllx    %o4, 24, %o4
        or      %o4, %i3, %i3
        lduha   [%i0+5]%asi, %o4
        sllx    %o4, 8, %o4
        or      %o4, %i3, %i3
        lduba   [%i0+7]%asi, %o4
        or      %o4, %i3, %i3
        stx     %i3, [%i1+%i0]
        subcc   %o3, 8, %o3
        bnz     %ncc, .ci_unalnbyte_loop
        add     %i0, 8, %i0
        add     %i1, %i0, %i1           ! restore pointer

        ! Destination is now block (64-byte) aligned; src is not 8-byte aligned
.ci_unalnsrc:
        andn    %i2, 0x3f, %i3          ! %i3 is multiple of block size
        and     %i2, 0x3f, %i2          ! residue bytes in %i2
        add     %i2, 64, %i2            ! ensure we don't load beyond
        sub     %i3, 64, %i3            ! the end of the source buffer

        andn    %i0, 0x3f, %o4          ! %o4 has block aligned src address
        prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read
        alignaddr %i0, %g0, %g0         ! generate %gsr
        add     %i0, %i3, %i0           ! advance %i0 to after blocks
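        !
        ! Each .ci_unaln_* loop below block-loads 64 aligned bytes and
        ! uses faligndata, steered by the %gsr byte offset set up by
        ! alignaddr above, to produce store-aligned doublewords that
        ! span two source doublewords; roughly (illustrative):
        !       out = (a << 8*off) | (b >> 8*(8 - off));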
        !
        ! Determine source alignment to correct 8 byte offset
        andcc   %i0, 0x20, %o3
        brnz,pn %o3, .ci_unaln_1
        andcc   %i0, 0x10, %o3
        brnz,pn %o3, .ci_unaln_01
        andcc   %i0, 0x08, %o3
        brz,a   %o3, .ci_unaln_000
        prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
        ba      .ci_unaln_001
        nop
.ci_unaln_01:
        brnz,a  %o3, .ci_unaln_011
        prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
        ba      .ci_unaln_010
        nop
.ci_unaln_1:
        brnz,pn %o3, .ci_unaln_11
        andcc   %i0, 0x08, %o3
        brnz,a  %o3, .ci_unaln_101
        prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
        ba      .ci_unaln_100
        nop
.ci_unaln_11:
        brz,pn  %o3, .ci_unaln_110
        prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read

.ci_unaln_111:
        ldda    [%o4+56]%asi, %d14
.ci_unaln_111_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_AIUS, %d16
        faligndata %d14, %d16, %d48
        faligndata %d16, %d18, %d50
        faligndata %d18, %d20, %d52
        faligndata %d20, %d22, %d54
        faligndata %d22, %d24, %d56
        faligndata %d24, %d26, %d58
        faligndata %d26, %d28, %d60
        faligndata %d28, %d30, %d62
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .ci_unaln_111_loop
        prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
        ba      .ci_unaln_done
        nop

.ci_unaln_110:
        ldda    [%o4+48]%asi, %d12
        ldda    [%o4+56]%asi, %d14
.ci_unaln_110_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_AIUS, %d16
        faligndata %d12, %d14, %d48
        faligndata %d14, %d16, %d50
        faligndata %d16, %d18, %d52
        faligndata %d18, %d20, %d54
        faligndata %d20, %d22, %d56
        faligndata %d22, %d24, %d58
        faligndata %d24, %d26, %d60
        faligndata %d26, %d28, %d62
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .ci_unaln_110_loop
        prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
        ba      .ci_unaln_done
        nop

.ci_unaln_101:
        ldda    [%o4+40]%asi, %d10
        ldda    [%o4+48]%asi, %d12
        ldda    [%o4+56]%asi, %d14
.ci_unaln_101_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_AIUS, %d16
        faligndata %d10, %d12, %d48
        faligndata %d12, %d14, %d50
        faligndata %d14, %d16, %d52
        faligndata %d16, %d18, %d54
        faligndata %d18, %d20, %d56
        faligndata %d20, %d22, %d58
        faligndata %d22, %d24, %d60
        faligndata %d24, %d26, %d62
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .ci_unaln_101_loop
        prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
        ba      .ci_unaln_done
        nop

.ci_unaln_100:
        ldda    [%o4+32]%asi, %d8
        ldda    [%o4+40]%asi, %d10
        ldda    [%o4+48]%asi, %d12
        ldda    [%o4+56]%asi, %d14
.ci_unaln_100_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_AIUS, %d16
        faligndata %d8, %d10, %d48
        faligndata %d10, %d12, %d50
        faligndata %d12, %d14, %d52
        faligndata %d14, %d16, %d54
        faligndata %d16, %d18, %d56
        faligndata %d18, %d20, %d58
        faligndata %d20, %d22, %d60
        faligndata %d22, %d24, %d62
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .ci_unaln_100_loop
        prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
        ba      .ci_unaln_done
        nop

.ci_unaln_011:
        ldda    [%o4+24]%asi, %d6
        ldda    [%o4+32]%asi, %d8
        ldda    [%o4+40]%asi, %d10
        ldda    [%o4+48]%asi, %d12
        ldda    [%o4+56]%asi, %d14
.ci_unaln_011_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_AIUS, %d16
        faligndata %d6, %d8, %d48
        faligndata %d8, %d10, %d50
        faligndata %d10, %d12, %d52
        faligndata %d12, %d14, %d54
        faligndata %d14, %d16, %d56
        faligndata %d16, %d18, %d58
        faligndata %d18, %d20, %d60
        faligndata %d20, %d22, %d62
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .ci_unaln_011_loop
        prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
        ba      .ci_unaln_done
        nop

.ci_unaln_010:
        ldda    [%o4+16]%asi, %d4
        ldda    [%o4+24]%asi, %d6
        ldda    [%o4+32]%asi, %d8
        ldda    [%o4+40]%asi, %d10
        ldda    [%o4+48]%asi, %d12
        ldda    [%o4+56]%asi, %d14
.ci_unaln_010_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_AIUS, %d16
        faligndata %d4, %d6, %d48
        faligndata %d6, %d8, %d50
        faligndata %d8, %d10, %d52
        faligndata %d10, %d12, %d54
        faligndata %d12, %d14, %d56
        faligndata %d14, %d16, %d58
        faligndata %d16, %d18, %d60
        faligndata %d18, %d20, %d62
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .ci_unaln_010_loop
        prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
        ba      .ci_unaln_done
        nop

.ci_unaln_001:
        ldda    [%o4+8]%asi, %d2
        ldda    [%o4+16]%asi, %d4
        ldda    [%o4+24]%asi, %d6
        ldda    [%o4+32]%asi, %d8
        ldda    [%o4+40]%asi, %d10
        ldda    [%o4+48]%asi, %d12
        ldda    [%o4+56]%asi, %d14
.ci_unaln_001_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_AIUS, %d16
        faligndata %d2, %d4, %d48
        faligndata %d4, %d6, %d50
        faligndata %d6, %d8, %d52
        faligndata %d8, %d10, %d54
        faligndata %d10, %d12, %d56
        faligndata %d12, %d14, %d58
        faligndata %d14, %d16, %d60
        faligndata %d16, %d18, %d62
        fmovd   %d18, %d2
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .ci_unaln_001_loop
        prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
        ba      .ci_unaln_done
        nop

.ci_unaln_000:
        ldda    [%o4]ASI_BLK_AIUS, %d0
.ci_unaln_000_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_AIUS, %d16
        faligndata %d0, %d2, %d48
        faligndata %d2, %d4, %d50
        faligndata %d4, %d6, %d52
        faligndata %d6, %d8, %d54
        faligndata %d8, %d10, %d56
        faligndata %d10, %d12, %d58
        faligndata %d12, %d14, %d60
        faligndata %d14, %d16, %d62
        fmovd   %d16, %d0
        fmovd   %d18, %d2
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%i1]ASI_BLK_P
        subcc   %i3, 64, %i3
        add     %i1, 64, %i1
        bgu,pt  %ncc, .ci_unaln_000_loop
        prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read

.ci_unaln_done:
        ! Handle trailing bytes, 64 to 127
        ! Dest long word aligned, Src not long word aligned
        cmp     %i2, 15
        bleu    %ncc, .ci_unaln_short

        andn    %i2, 0x7, %i3           ! %i3 is multiple of 8
        and     %i2, 0x7, %i2           ! residue bytes in %i2
        add     %i2, 8, %i2
        sub     %i3, 8, %i3             ! ensure we don't load past end of src
        andn    %i0, 0x7, %o4           ! %o4 has long word aligned src address
        add     %i0, %i3, %i0           ! advance %i0 to after multiple of 8
        ldda    [%o4]%asi, %d0          ! fetch partial word
.ci_unaln_by8:
        ldda    [%o4+8]%asi, %d2
        add     %o4, 8, %o4
        faligndata %d0, %d2, %d16
        subcc   %i3, 8, %i3
        std     %d16, [%i1]
        fmovd   %d2, %d0
        bgu,pt  %ncc, .ci_unaln_by8
        add     %i1, 8, %i1

.ci_unaln_short:
        cmp     %i2, 8
        blt,pt  %ncc, .ci_unalnfin
        nop
        lduba   [%i0]%asi, %o4
        sll     %o4, 24, %o3
        lduba   [%i0+1]%asi, %o4
        sll     %o4, 16, %o4
        or      %o4, %o3, %o3
        lduba   [%i0+2]%asi, %o4
        sll     %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%i0+3]%asi, %o4
        or      %o4, %o3, %o3
        stw     %o3, [%i1]
        lduba   [%i0+4]%asi, %o4
        sll     %o4, 24, %o3
        lduba   [%i0+5]%asi, %o4
        sll     %o4, 16, %o4
        or      %o4, %o3, %o3
        lduba   [%i0+6]%asi, %o4
        sll     %o4, 8, %o4
        or      %o4, %o3, %o3
        lduba   [%i0+7]%asi, %o4
        or      %o4, %o3, %o3
        stw     %o3, [%i1+4]
        add     %i0, 8, %i0
        add     %i1, 8, %i1
        sub     %i2, 8, %i2
.ci_unalnfin:
        cmp     %i2, 4
        blt,pt  %ncc, .ci_unalnz
        tst     %i2
        lduba   [%i0]%asi, %o3          ! read byte
        subcc   %i2, 4, %i2             ! reduce count by 4
        sll     %o3, 24, %o3            ! position
        lduba   [%i0+1]%asi, %o4
        sll     %o4, 16, %o4            ! position
        or      %o4, %o3, %o3           ! merge
        lduba   [%i0+2]%asi, %o4
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o3           ! merge
        add     %i1, 4, %i1             ! advance dst by 4
        lduba   [%i0+3]%asi, %o4
        add     %i0, 4, %i0             ! advance src by 4
        or      %o4, %o3, %o4           ! merge
        bnz,pt  %ncc, .ci_unaln3x
        stw     %o4, [%i1-4]
        ba      .ci_exit
        nop
.ci_unalnz:
        bz,pt   %ncc, .ci_exit
        wr      %l5, %g0, %gsr          ! restore %gsr
.ci_unaln3x:                            ! Exactly 1, 2, or 3 bytes remain
        subcc   %i2, 1, %i2             ! reduce count for cc test
        lduba   [%i0]%asi, %o4          ! load one byte
        bz,pt   %ncc, .ci_exit
        stb     %o4, [%i1]              ! store one byte
        lduba   [%i0+1]%asi, %o4        ! load second byte
        subcc   %i2, 1, %i2
        bz,pt   %ncc, .ci_exit
        stb     %o4, [%i1+1]            ! store second byte
        lduba   [%i0+2]%asi, %o4        ! load third byte
        stb     %o4, [%i1+2]            ! store third byte
.ci_exit:
        brnz    %g1, .ci_fp_restore
        nop
        FZERO
        wr      %g1, %g0, %fprs
        ba,pt   %ncc, .ci_ex2
        membar  #Sync
.ci_fp_restore:
        BLD_FP_FROMSTACK(%o4)
.ci_ex2:
        andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
        ret
        restore %g0, 0, %o0

.copyin_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4
        brz     %o4, 2f
        nop
        ldn     [%o4 + CP_COPYIN], %g2
        jmp     %g2
        nop
2:
        retl
        mov     -1, %o0

#else   /* NIAGARA_IMPL */
.do_copyin:
        !
        ! Check the length and bail if zero.
        !
        tst     %o2
        bnz,pt  %ncc, 1f
        nop
        retl
        clr     %o0
1:
        sethi   %hi(copyio_fault), %o4
        or      %o4, %lo(copyio_fault), %o4
        sethi   %hi(copyio_fault_nowindow), %o3
        ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
        or      %o3, %lo(copyio_fault_nowindow), %o3
        membar  #Sync
        stn     %o3, [THREAD_REG + T_LOFAULT]

        mov     %o0, SAVE_SRC
        mov     %o1, SAVE_DST
        mov     %o2, SAVE_COUNT

        !
        ! Check to see if we're more than SMALL_LIMIT.
        !
        subcc   %o2, SMALL_LIMIT, %o3
        bgu,a,pt %ncc, .dci_ns
        or      %o0, %o1, %o3
        !
        ! What was previously ".small_copyin"
        !
.dcibcp:
        sub     %g0, %o2, %o3           ! setup for copy loop
        add     %o0, %o2, %o0
        add     %o1, %o2, %o1
        ba,pt   %ncc, .dcicl
        lduba   [%o0 + %o3]ASI_USER, %o4
        !
        ! %o0 and %o1 point at the end and remain pointing at the end
        ! of their buffers. We pull things out by adding %o3 (which is
        ! the negation of the length) to the buffer end which gives us
        ! the current location in the buffers. By incrementing %o3 we walk
        ! through both buffers without having to bump each buffer's
        ! pointer. A very fast 4 instruction loop.
        !
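        ! As a C sketch (hypothetical names, illustration only):
        !
        !       ssize_t off = -(ssize_t)len;    /* runs -len .. -1 */
        !       const char *src_end = src + len;
        !       char *dst_end = dst + len;
        !
        !       while (off < 0) {
        !               dst_end[off] = src_end[off];
        !               off++;                  /* one bump walks both */
        !       }
        !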
        .align 16
.dcicl:
        stb     %o4, [%o1 + %o3]
        inccc   %o3
        bl,a,pt %ncc, .dcicl
        lduba   [%o0 + %o3]ASI_USER, %o4
        !
        ! We're done. Go home.
        !
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
        retl
        clr     %o0
        !
        ! Try aligned copies from here.
        !
.dci_ns:
        !
        ! See if we're single byte aligned. If we are, check the
        ! limit for single byte copies. If we're smaller, or equal,
        ! bounce to the byte for byte copy loop. Otherwise do it in
        ! HW (if enabled).
        !
        btst    1, %o3
        bz,a,pt %icc, .dcih8
        btst    7, %o3
        !
        ! We're single byte aligned.
        !
        sethi   %hi(hw_copy_limit_1), %o3
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        !
        ! Is HW copy on? If not do everything byte for byte.
        !
        tst     %o3
        bz,pn   %icc, .dcibcp
        subcc   %o3, %o2, %o3
        !
        ! Are we bigger than the HW limit? If not
        ! go to byte for byte.
        !
        bge,pt  %ncc, .dcibcp
        nop
        !
        ! We're big enough and copy is on. Do it with HW.
        !
        ba,pt   %ncc, .big_copyin
        nop
.dcih8:
        !
        ! 8 byte aligned?
        !
        bnz,a   %ncc, .dcih4
        btst    3, %o3
        !
        ! We're eight byte aligned.
        !
        sethi   %hi(hw_copy_limit_8), %o3
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        !
        ! Is HW assist on? If not, do it with the aligned copy.
        !
        tst     %o3
        bz,pn   %icc, .dcis8
        subcc   %o3, %o2, %o3
        bge     %ncc, .dcis8
        nop
        ba,pt   %ncc, .big_copyin
        nop
.dcis8:
        !
        ! Housekeeping for copy loops. Uses same idea as in the byte for
        ! byte copy loop above.
        !
        add     %o0, %o2, %o0
        add     %o1, %o2, %o1
        sub     %g0, %o2, %o3
        ba,pt   %ncc, .didebc
        srl     %o2, 3, %o2             ! Number of 8 byte chunks to copy
        !
        ! 4 byte aligned?
        !
.dcih4:
        bnz     %ncc, .dcih2
        sethi   %hi(hw_copy_limit_4), %o3
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        !
        ! Is HW assist on? If not, do it with the aligned copy.
        !
        tst     %o3
        bz,pn   %icc, .dcis4
        subcc   %o3, %o2, %o3
        !
        ! We're negative if our size is larger than hw_copy_limit_4.
        !
        bge     %ncc, .dcis4
        nop
        ba,pt   %ncc, .big_copyin
        nop
.dcis4:
        !
        ! Housekeeping for copy loops. Uses same idea as in the byte
        ! for byte copy loop above.
        !
        add     %o0, %o2, %o0
        add     %o1, %o2, %o1
        sub     %g0, %o2, %o3
        ba,pt   %ncc, .didfbc
        srl     %o2, 2, %o2             ! Number of 4 byte chunks to copy
.dcih2:
        !
        ! We're two byte aligned. Check for "smallness"
        ! done in delay at .dcih4
        !
        bleu,pt %ncc, .dcis2
        sethi   %hi(hw_copy_limit_2), %o3
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        !
        ! Is HW assist on? If not, do it with the aligned copy.
        !
        tst     %o3
        bz,pn   %icc, .dcis2
        subcc   %o3, %o2, %o3
        !
        ! Are we larger than the HW limit?
        !
        bge     %ncc, .dcis2
        nop
        !
        ! HW assist is on and we're large enough to use it.
        !
        ba,pt   %ncc, .big_copyin
        nop
        !
        ! Housekeeping for copy loops. Uses same idea as in the byte
        ! for byte copy loop above.
        !
.dcis2:
        add     %o0, %o2, %o0
        add     %o1, %o2, %o1
        sub     %g0, %o2, %o3
        ba,pt   %ncc, .didtbc
        srl     %o2, 1, %o2             ! Number of 2 byte chunks to copy
        !
.small_copyin:
        !
        ! Why are we doing this AGAIN? There are certain conditions in
        ! big copyin that will cause us to forgo the HW assisted copies
        ! and bounce back to a non-hw assisted copy. This dispatches
        ! those copies. Note that we branch around this in the main line
        ! code.
        !
        ! We make no check for limits or HW enablement here. We've
        ! already been told that we're a poster child so just go off
        ! and do it.
        !
        or      %o0, %o1, %o3
        btst    1, %o3
        bnz     %icc, .dcibcp           ! Most likely
        btst    7, %o3
        bz      %icc, .dcis8
        btst    3, %o3
        bz      %icc, .dcis4
        nop
        ba,pt   %ncc, .dcis2
        nop
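        !
        ! The dispatch above keys off the OR of the two addresses: a low
        ! bit set in (src | dst) means at least one side is misaligned at
        ! that granularity. As a C sketch (hypothetical names, sketch only):
        !
        !       uintptr_t both = (uintptr_t)src | (uintptr_t)dst;
        !
        !       if (both & 1)           byte_copy();    /* most likely */
        !       else if (!(both & 7))   copy_by_8();
        !       else if (!(both & 3))   copy_by_4();
        !       else                    copy_by_2();
        !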
        !
        ! Eight byte aligned copies. A steal from the original .small_copyin
        ! with modifications. %o2 is number of 8 byte chunks to copy. When
        ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
        ! to copy.
        !
        .align 32
.didebc:
        ldxa    [%o0 + %o3]ASI_USER, %o4
        deccc   %o2
        stx     %o4, [%o1 + %o3]
        bg,pt   %ncc, .didebc
        addcc   %o3, 8, %o3
        !
        ! End of copy loop. Most 8 byte aligned copies end here.
        !
        bz,pt   %ncc, .dcifh
        nop
        !
        ! Something is left. Do it byte for byte.
        !
        ba,pt   %ncc, .dcicl
        lduba   [%o0 + %o3]ASI_USER, %o4
        !
        ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
        !
        .align 32
.didfbc:
        lduwa   [%o0 + %o3]ASI_USER, %o4
        deccc   %o2
        st      %o4, [%o1 + %o3]
        bg,pt   %ncc, .didfbc
        addcc   %o3, 4, %o3
        !
        ! End of copy loop. Most 4 byte aligned copies end here.
        !
        bz,pt   %ncc, .dcifh
        nop
        !
        ! Something is left. Do it byte for byte.
        !
        ba,pt   %ncc, .dcicl
        lduba   [%o0 + %o3]ASI_USER, %o4
        !
        ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
        ! copy.
        !
        .align 32
.didtbc:
        lduha   [%o0 + %o3]ASI_USER, %o4
        deccc   %o2
        sth     %o4, [%o1 + %o3]
        bg,pt   %ncc, .didtbc
        addcc   %o3, 2, %o3
        !
        ! End of copy loop. Most 2 byte aligned copies end here.
        !
        bz,pt   %ncc, .dcifh
        nop
        !
        ! Deal with the last byte
        !
        lduba   [%o0 + %o3]ASI_USER, %o4
        stb     %o4, [%o1 + %o3]
.dcifh:
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
        retl
        clr     %o0

.big_copyin:
        ! We're going off to do a block copy.
        ! Switch fault handlers and grab a window. We
        ! don't do a membar #Sync since we've done only
        ! kernel data to this point.
        stn     %o4, [THREAD_REG + T_LOFAULT]

        ! Copies that reach here are larger than 256 bytes. The
        ! hw_copy_limit_1 tunable is set to 256; never set this limit
        ! below 128 bytes.
        save    %sp, -SA(MINFRAME), %sp
.do_blockcopyin:

        ! Swap src/dst since the code below is memcpy code
        ! and memcpy/bcopy have different calling sequences
        mov     %i1, %i5
        mov     %i0, %i1
        mov     %i5, %i0

        ! Block (64 bytes) align the destination.
        andcc   %i0, 0x3f, %i3          ! is dst block aligned
        bz      %ncc, copyin_blalign    ! dst already block aligned
        sub     %i3, 0x40, %i3
        neg     %i3                     ! bytes till dst 64 bytes aligned
        sub     %i2, %i3, %i2           ! update i2 with new count
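        ! For example, if dst & 0x3f is 0x18, then %i3 becomes
        ! -(0x18 - 0x40) = 0x28, so 0x28 (40) bytes are copied before
        ! the destination reaches the next 64-byte boundary.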

        ! Based on source and destination alignment do
        ! either 8 bytes, 4 bytes, 2 bytes or byte copy.

        ! Is dst & src 8B aligned
        or      %i0, %i1, %o2
        andcc   %o2, 0x7, %g0
        bz      %ncc, .ci_alewdcp
        nop

        ! Is dst & src 4B aligned
        andcc   %o2, 0x3, %g0
        bz      %ncc, .ci_alwdcp
        nop

        ! Is dst & src 2B aligned
        andcc   %o2, 0x1, %g0
        bz      %ncc, .ci_alhlfwdcp
        nop

        ! 1B aligned
1:      lduba   [%i1]ASI_USER, %o2
        stb     %o2, [%i0]
        inc     %i1
        deccc   %i3
        bgu,pt  %ncc, 1b
        inc     %i0

        ba      copyin_blalign
        nop

        ! dst & src 4B aligned
.ci_alwdcp:
        lda     [%i1]ASI_USER, %o2
        st      %o2, [%i0]
        add     %i1, 0x4, %i1
        subcc   %i3, 0x4, %i3
        bgu,pt  %ncc, .ci_alwdcp
        add     %i0, 0x4, %i0

        ba      copyin_blalign
        nop

        ! dst & src 2B aligned
.ci_alhlfwdcp:
        lduha   [%i1]ASI_USER, %o2
        stuh    %o2, [%i0]
        add     %i1, 0x2, %i1
        subcc   %i3, 0x2, %i3
        bgu,pt  %ncc, .ci_alhlfwdcp
        add     %i0, 0x2, %i0

        ba      copyin_blalign
        nop

        ! dst & src 8B aligned
.ci_alewdcp:
        ldxa    [%i1]ASI_USER, %o2
        stx     %o2, [%i0]
        add     %i1, 0x8, %i1
        subcc   %i3, 0x8, %i3
        bgu,pt  %ncc, .ci_alewdcp
        add     %i0, 0x8, %i0

copyin_blalign:
        andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
        sub     %i2, %i3, %i2           ! Residue bytes in %i2

        mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

        andcc   %i1, 0xf, %o2           ! is src quadword aligned
        bz,pn   %xcc, .ci_blkcpy        ! src offset in %o2 (last 4-bits)
        nop
        cmp     %o2, 0x8
        bg      .ci_upper_double
        nop
        bl      .ci_lower_double
        nop

        ! Falls through when source offset is equal to 8 i.e.
        ! source is double word aligned.
        ! In this case no shift/merge of data is required

        sub     %i1, %o2, %i1           ! align the src at 16 bytes.
        andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
        prefetcha [%l0]ASI_USER, #one_read
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
        add     %l0, 0x40, %l0
.ci_loop0:
        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4

        prefetcha [%l0]ASI_USER, #one_read

        stxa    %l3, [%i0+0x0]%asi
        stxa    %l4, [%i0+0x8]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2

        stxa    %l5, [%i0+0x10]%asi
        stxa    %l2, [%i0+0x18]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4

        stxa    %l3, [%i0+0x20]%asi
        stxa    %l4, [%i0+0x28]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2

        stxa    %l5, [%i0+0x30]%asi
        stxa    %l2, [%i0+0x38]%asi

        add     %l0, 0x40, %l0
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, .ci_loop0
        add     %i0, 0x40, %i0
        ba      .ci_blkdone
        add     %i1, %o2, %i1           ! increment the source by src offset
                                        ! the src offset was stored in %o2

.ci_lower_double:

        sub     %i1, %o2, %i1           ! align the src at 16 bytes.
        sll     %o2, 3, %o0             ! %o0 left shift
        mov     0x40, %o1
        sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
        andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
        prefetcha [%l0]ASI_USER, #one_read
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2    ! partial data in %l2
                                                        ! and %l3 has complete
                                                        ! data
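        ! ALIGN_DATA below merges the straddling quadword reads into
        ! aligned stores using the shift counts computed above; roughly,
        ! in C (sketch only): out = (a << %o0) | (b >> %o1), where
        ! %o1 == 64 - %o0 in bits. E.g. a source offset of 3 gives a
        ! left shift of 24 bits and a right shift of 40.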
        add     %l0, 0x40, %l0
.ci_loop1:
        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4    ! %l4 has partial data
                                                        ! for this read.
        ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)        ! merge %l2, %l3 and %l4
                                                        ! into %l2 and %l3

        prefetcha [%l0]ASI_USER, #one_read

        stxa    %l2, [%i0+0x0]%asi
        stxa    %l3, [%i0+0x8]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
        ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)        ! merge %l2 with %l5 and
                                                        ! %l4 from previous read
                                                        ! into %l4 and %l5
        stxa    %l4, [%i0+0x10]%asi
        stxa    %l5, [%i0+0x18]%asi

        ! Repeat the same for next 32 bytes.

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
        ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)

        stxa    %l2, [%i0+0x20]%asi
        stxa    %l3, [%i0+0x28]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
        ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)

        stxa    %l4, [%i0+0x30]%asi
        stxa    %l5, [%i0+0x38]%asi

        add     %l0, 0x40, %l0
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, .ci_loop1
        add     %i0, 0x40, %i0
        ba      .ci_blkdone
        add     %i1, %o2, %i1           ! increment the source by src offset
                                        ! the src offset was stored in %o2

.ci_upper_double:

        sub     %i1, %o2, %i1           ! align the src at 16 bytes.
        sub     %o2, 0x8, %o0
        sll     %o0, 3, %o0             ! %o0 left shift
        mov     0x40, %o1
        sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
        andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
        prefetcha [%l0]ASI_USER, #one_read
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2    ! partial data in %l3
                                                        ! for this read and
                                                        ! no data in %l2
        add     %l0, 0x40, %l0
.ci_loop2:
        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4    ! %l4 has complete data
                                                        ! and %l5 has partial
        ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)        ! merge %l3, %l4 and %l5
                                                        ! into %l3 and %l4
        prefetcha [%l0]ASI_USER, #one_read

        stxa    %l3, [%i0+0x0]%asi
        stxa    %l4, [%i0+0x8]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
        ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)        ! merge %l2 and %l3 with
                                                        ! %l5 from previous read
                                                        ! into %l5 and %l2

        stxa    %l5, [%i0+0x10]%asi
        stxa    %l2, [%i0+0x18]%asi

        ! Repeat the same for next 32 bytes.

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
        ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)

        stxa    %l3, [%i0+0x20]%asi
        stxa    %l4, [%i0+0x28]%asi

        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
        ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)

        stxa    %l5, [%i0+0x30]%asi
        stxa    %l2, [%i0+0x38]%asi

        add     %l0, 0x40, %l0
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, .ci_loop2
        add     %i0, 0x40, %i0
        ba      .ci_blkdone
        add     %i1, %o2, %i1           ! increment the source by src offset
                                        ! the src offset was stored in %o2


        ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
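        ! Each ldda quad load brings 16 bytes into a register pair, and
        ! the block-init stores allocate destination cache lines without
        ! first fetching their old contents, since every line is about
        ! to be fully overwritten.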
.ci_blkcpy:

        andn    %i1, 0x3f, %o0          ! %o0 has block aligned source
        prefetcha [%o0]ASI_USER, #one_read
        add     %o0, 0x40, %o0
1:
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
        add     %i1, 0x10, %i1

        prefetcha [%o0]ASI_USER, #one_read

        stxa    %l0, [%i0+0x0]%asi

        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
        add     %i1, 0x10, %i1
        ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
        add     %i1, 0x10, %i1

        stxa    %l1, [%i0+0x8]%asi
        stxa    %l2, [%i0+0x10]%asi
        stxa    %l3, [%i0+0x18]%asi
        stxa    %l4, [%i0+0x20]%asi
        stxa    %l5, [%i0+0x28]%asi
        stxa    %l6, [%i0+0x30]%asi
        stxa    %l7, [%i0+0x38]%asi

        add     %o0, 0x40, %o0
        subcc   %i3, 0x40, %i3
        bgu,pt  %xcc, 1b
        add     %i0, 0x40, %i0

.ci_blkdone:
        membar  #Sync

        brz,pt  %i2, .copyin_exit
        nop

        ! Handle trailing bytes
        cmp     %i2, 0x8
        blu,pt  %ncc, .ci_residue
        nop

        ! Can we do some 8B ops
        or      %i1, %i0, %o2
        andcc   %o2, 0x7, %g0
        bnz     %ncc, .ci_last4
        nop

        ! Do 8byte ops as long as possible
.ci_last8:
        ldxa    [%i1]ASI_USER, %o2
        stx     %o2, [%i0]
        add     %i1, 0x8, %i1
        sub     %i2, 0x8, %i2
        cmp     %i2, 0x8
        bgu,pt  %ncc, .ci_last8
        add     %i0, 0x8, %i0

        brz,pt  %i2, .copyin_exit
        nop

        ba      .ci_residue
        nop

.ci_last4:
        ! Can we do 4B ops
        andcc   %o2, 0x3, %g0
        bnz     %ncc, .ci_last2
        nop
1:
        lda     [%i1]ASI_USER, %o2
        st      %o2, [%i0]
        add     %i1, 0x4, %i1
        sub     %i2, 0x4, %i2
        cmp     %i2, 0x4
        bgu,pt  %ncc, 1b
        add     %i0, 0x4, %i0

        brz,pt  %i2, .copyin_exit
        nop

        ba      .ci_residue
        nop

.ci_last2:
        ! Can we do 2B ops
        andcc   %o2, 0x1, %g0
        bnz     %ncc, .ci_residue
        nop

1:
        lduha   [%i1]ASI_USER, %o2
        stuh    %o2, [%i0]
        add     %i1, 0x2, %i1
        sub     %i2, 0x2, %i2
        cmp     %i2, 0x2
        bgu,pt  %ncc, 1b
        add     %i0, 0x2, %i0

        brz,pt  %i2, .copyin_exit
        nop

        ! Copy the residue as byte copy
.ci_residue:
        lduba   [%i1]ASI_USER, %i4
        stb     %i4, [%i0]
        inc     %i1
        deccc   %i2
        bgu,pt  %xcc, .ci_residue
        inc     %i0

.copyin_exit:
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
        ret
        restore %g0, 0, %o0
.copyin_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4
        brz     %o4, 2f
        nop
        ldn     [%o4 + CP_COPYIN], %g2
        jmp     %g2
        nop
2:
        retl
        mov     -1, %o0
#endif  /* NIAGARA_IMPL */
        SET_SIZE(copyin)

        ENTRY(xcopyin)
        sethi   %hi(.xcopyin_err), REAL_LOFAULT
        b       .do_copyin
        or      REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
.xcopyin_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4
        brz     %o4, 2f
        nop
        ldn     [%o4 + CP_XCOPYIN], %g2
        jmp     %g2
        nop
2:
        retl
        mov     %g1, %o0
        SET_SIZE(xcopyin)

        ENTRY(xcopyin_little)
        sethi   %hi(.little_err), %o4
        ldn     [THREAD_REG + T_LOFAULT], %o5
        or      %o4, %lo(.little_err), %o4
        membar  #Sync                           ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]

        subcc   %g0, %o2, %o3
        add     %o0, %o2, %o0
        bz,pn   %ncc, 2f                ! check for zero bytes
        sub     %o2, 1, %o4
        add     %o0, %o4, %o0           ! start w/last byte
        add     %o1, %o2, %o1
        lduba   [%o0+%o3]ASI_AIUSL, %o4

1:      stb     %o4, [%o1+%o3]
        inccc   %o3
        sub     %o0, 2, %o0             ! get next byte
        bcc,a,pt %ncc, 1b
        lduba   [%o0+%o3]ASI_AIUSL, %o4
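
        ! %o3 advances by one while %o0 is pulled back by two, a net step
        ! of -1: bytes are fetched from the user buffer last-to-first and
        ! stored to the kernel buffer first-to-last.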

2:      membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
        mov     %g0, %o0                ! return (0)

.little_err:
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
        mov     %g1, %o0
        SET_SIZE(xcopyin_little)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

        ENTRY(copyin_noerr)
        sethi   %hi(.copyio_noerr), REAL_LOFAULT
        b       .do_copyin
        or      REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
        jmp     SAVED_LOFAULT
        nop
        SET_SIZE(copyin_noerr)

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

        ENTRY(copyout_noerr)
        sethi   %hi(.copyio_noerr), REAL_LOFAULT
        b       .do_copyout
        or      REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
        SET_SIZE(copyout_noerr)

        .align  4
        DGDEF(use_hw_bcopy)
        .word   1
        DGDEF(use_hw_bzero)
        .word   1
        DGDEF(hw_copy_limit_1)
        .word   0x100
        DGDEF(hw_copy_limit_2)
        .word   0x200
        DGDEF(hw_copy_limit_4)
        .word   0x400
        DGDEF(hw_copy_limit_8)
        .word   0x400
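
        ! The hw_copy_limit_* values are byte counts: copies at or below
        ! the limit for their alignment use the simple loops above, and a
        ! limit of zero disables HW-assisted copy for that alignment.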

        .align  64
        .section ".text"

/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
 * at least 256 bytes long using Niagara's block stores/quad store.
 * If the criteria for using this routine are not met then it calls bzero
 * and returns 1.  Otherwise 0 is returned indicating success.
 * Caller is responsible for ensuring use_hw_bzero is true and that
 * kpreempt_disable() has been called.
 */
        ! %i0 - start address
        ! %i1 - length of region (multiple of 64)
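        !
        ! C-level usage sketch (hypothetical caller, illustration only):
        !
        !       kpreempt_disable();
        !       if (use_hw_bzero)
        !               ret = hwblkclr(addr, len);  /* 1 => punted to bzero */
        !       kpreempt_enable();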

        ENTRY(hwblkclr)
        save    %sp, -SA(MINFRAME), %sp

        ! Must be block-aligned
        andcc   %i0, 0x3f, %g0
        bnz,pn  %ncc, 1f
        nop

        ! ... and must be 256 bytes or more
        cmp     %i1, 0x100
        blu,pn  %ncc, 1f
        nop

        ! ... and length must be a multiple of 64
        andcc   %i1, 0x3f, %g0
        bz,pt   %ncc, .pz_doblock
        mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

1:      ! punt, call bzero but notify the caller that bzero was used
        mov     %i0, %o0
        call    bzero
        mov     %i1, %o1
        ret
        restore %g0, 1, %o0     ! return (1) - did not use block operations

        ! Already verified that there are at least 256 bytes to set
.pz_doblock:
        stxa    %g0, [%i0+0x0]%asi
        stxa    %g0, [%i0+0x40]%asi
        stxa    %g0, [%i0+0x80]%asi
        stxa    %g0, [%i0+0xc0]%asi

        stxa    %g0, [%i0+0x8]%asi
        stxa    %g0, [%i0+0x10]%asi
        stxa    %g0, [%i0+0x18]%asi
        stxa    %g0, [%i0+0x20]%asi
        stxa    %g0, [%i0+0x28]%asi
        stxa    %g0, [%i0+0x30]%asi
        stxa    %g0, [%i0+0x38]%asi

        stxa    %g0, [%i0+0x48]%asi
        stxa    %g0, [%i0+0x50]%asi
        stxa    %g0, [%i0+0x58]%asi
        stxa    %g0, [%i0+0x60]%asi
        stxa    %g0, [%i0+0x68]%asi
        stxa    %g0, [%i0+0x70]%asi
        stxa    %g0, [%i0+0x78]%asi

        stxa    %g0, [%i0+0x88]%asi
        stxa    %g0, [%i0+0x90]%asi
        stxa    %g0, [%i0+0x98]%asi
        stxa    %g0, [%i0+0xa0]%asi
        stxa    %g0, [%i0+0xa8]%asi
        stxa    %g0, [%i0+0xb0]%asi
        stxa    %g0, [%i0+0xb8]%asi

        stxa    %g0, [%i0+0xc8]%asi
        stxa    %g0, [%i0+0xd0]%asi
        stxa    %g0, [%i0+0xd8]%asi
        stxa    %g0, [%i0+0xe0]%asi
        stxa    %g0, [%i0+0xe8]%asi
        stxa    %g0, [%i0+0xf0]%asi
        stxa    %g0, [%i0+0xf8]%asi

        sub     %i1, 0x100, %i1
        cmp     %i1, 0x100
        bgu,pt  %ncc, .pz_doblock
        add     %i0, 0x100, %i0

2:
        ! Check if more than 64 bytes to set
        cmp     %i1, 0x40
        blu     %ncc, .pz_finish
        nop

3:
        stxa    %g0, [%i0+0x0]%asi
        stxa    %g0, [%i0+0x8]%asi
        stxa    %g0, [%i0+0x10]%asi
        stxa    %g0, [%i0+0x18]%asi
        stxa    %g0, [%i0+0x20]%asi
        stxa    %g0, [%i0+0x28]%asi
        stxa    %g0, [%i0+0x30]%asi
        stxa    %g0, [%i0+0x38]%asi

        subcc   %i1, 0x40, %i1
        bgu,pt  %ncc, 3b
        add     %i0, 0x40, %i0

.pz_finish:
        membar  #Sync
        ret
        restore %g0, 0, %o0             ! return (bzero or not)
        SET_SIZE(hwblkclr)

        /*
         * Copy 32 bytes of data from src (%o0) to dst (%o1)
         * using physical addresses.
         */
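        /*
         * Runs with PSTATE_IE clear so no interrupt can be taken while
         * the loads and stores go directly to physical memory through
         * ASI_MEM; the saved %pstate is restored in the return delay slot.
         */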
        ENTRY_NP(hw_pa_bcopy32)
        rdpr    %pstate, %g1
        andn    %g1, PSTATE_IE, %g2
        wrpr    %g0, %g2, %pstate

        ldxa    [%o0]ASI_MEM, %o2
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o3
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o4
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o5
        stxa    %o2, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o3, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o4, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o5, [%o1]ASI_MEM

        membar  #Sync
        retl
        wrpr    %g0, %g1, %pstate
        SET_SIZE(hw_pa_bcopy32)

/*
 * Zero a block of storage.
 *
 * uzero is used by the kernel to zero a block in user address space.
 */

/*
 * Control flow of the bzero/kzero/uzero routine.
 *
 *      For stores of fewer than 7 bytes, zero byte by byte.
 *
 *      For stores of fewer than 15 bytes, align the address on a 4-byte
 *      boundary, then store as many 4-byte chunks as possible, followed
 *      by the trailing bytes.
 *
 *      For sizes of 15 bytes or more, align the address on an 8-byte
 *      boundary.
 *      if (count >= 128) {
 *              store as many 8-byte chunks as needed to block align
 *              the address, then
 *              store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
 *              store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
 *      }
 *      Store as many 8-byte chunks as possible, followed by the
 *      trailing bytes.
 */

        ENTRY(uzero)
        !
        ! Set a new lo_fault handler only if we came in with one
        ! already specified.
        !
        wr      %g0, ASI_USER, %asi
        ldn     [THREAD_REG + T_LOFAULT], %o5
        tst     %o5
        bz,pt   %ncc, .do_zero
        sethi   %hi(.zeroerr), %o2
        or      %o2, %lo(.zeroerr), %o2
        membar  #Sync
        ba,pt   %ncc, .do_zero
        stn     %o2, [THREAD_REG + T_LOFAULT]

        ENTRY(kzero)
        !
        ! Always set a lo_fault handler
        !
        wr      %g0, ASI_P, %asi
        ldn     [THREAD_REG + T_LOFAULT], %o5
        sethi   %hi(.zeroerr), %o2
        or      %o5, LOFAULT_SET, %o5
        or      %o2, %lo(.zeroerr), %o2
        membar  #Sync
        ba,pt   %ncc, .do_zero
        stn     %o2, [THREAD_REG + T_LOFAULT]

/*
 * We got here because of a fault during kzero or if
 * uzero or bzero was called with t_lofault non-zero.
 * Otherwise we've already run screaming from the room.
 * Errno value is in %g1. Note that we're here iff
 * we did set t_lofault.
 */
.zeroerr:
        !
        ! Undo asi register setting. Just set it to be the
        ! kernel default without checking.
        !
        wr      %g0, ASI_P, %asi

        !
        ! We did set t_lofault. It may well have been zero coming in.
        !
1:
        tst     %o5
        membar #Sync
        bne,pn  %ncc, 3f
        andncc  %o5, LOFAULT_SET, %o5
2:
        !
        ! Old handler was zero. Just return the error.
        !
        retl                            ! return
        mov     %g1, %o0                ! error code from %g1
3:
        !
        ! We're here because %o5 was non-zero. It was non-zero
        ! because either LOFAULT_SET was present, a previous fault
        ! handler was present or both. In all cases we need to reset
        ! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
        ! before we either simply return the error or we invoke the
        ! previously specified handler.
        !
        be      %ncc, 2b
        stn     %o5, [THREAD_REG + T_LOFAULT]
        jmp     %o5                     ! goto real handler
        nop
        SET_SIZE(kzero)
        SET_SIZE(uzero)

/*
 * Zero a block of storage.
 */

        ENTRY(bzero)
        wr      %g0, ASI_P, %asi

        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save old vector
        tst     %o5
        bz,pt   %ncc, .do_zero
        sethi   %hi(.zeroerr), %o2
        or      %o2, %lo(.zeroerr), %o2
        membar  #Sync                           ! sync error barrier
        stn     %o2, [THREAD_REG + T_LOFAULT]   ! install new vector

.do_zero:
        cmp     %o1, 7
        blu,pn  %ncc, .byteclr
        nop

        cmp     %o1, 15
        blu,pn  %ncc, .wdalign
        nop

        andcc   %o0, 7, %o3             ! is addr aligned on an 8-byte boundary
        bz,pt   %ncc, .blkalign         ! already double aligned
        sub     %o3, 8, %o3             ! -(bytes till double aligned)
        add     %o1, %o3, %o1           ! update o1 with new count

1:
        stba    %g0, [%o0]%asi
        inccc   %o3
        bl,pt   %ncc, 1b
        inc     %o0
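
        ! Example: addr & 7 == 5 gives %o3 = 5 - 8 = -3, so the loop above
        ! clears 3 bytes to reach the next 8-byte boundary; the count in
        ! %o1 was already reduced by those 3 bytes.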

        ! Now address is double aligned
.blkalign:
        cmp     %o1, 0x80               ! check if there are 128 bytes to set
        blu,pn  %ncc, .bzero_small
        mov     %o1, %o3

        sethi   %hi(use_hw_bzero), %o2
        ld      [%o2 + %lo(use_hw_bzero)], %o2
        tst     %o2
        bz      %ncc, .bzero_small
        mov     %o1, %o3

        rd      %asi, %o3
        wr      %g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
        cmp     %o3, ASI_P
        bne,a   %ncc, .algnblk
        wr      %g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi

.algnblk:
        andcc   %o0, 0x3f, %o3          ! is block aligned?
        bz,pt   %ncc, .bzero_blk
        sub     %o3, 0x40, %o3          ! -(bytes till block aligned)
        add     %o1, %o3, %o1           ! o1 is the remainder

        ! Clear -(%o3) bytes till block aligned
1:
        stxa    %g0, [%o0]%asi
        addcc   %o3, 8, %o3
        bl,pt   %ncc, 1b
        add     %o0, 8, %o0

.bzero_blk:
        and     %o1, 0x3f, %o3          ! calc bytes left after blk clear
        andn    %o1, 0x3f, %o4          ! calc size of blocks in bytes

        cmp     %o4, 0x100              ! 256 bytes or more
        blu,pn  %ncc, 3f
        nop

2:
        stxa    %g0, [%o0+0x0]%asi
        stxa    %g0, [%o0+0x40]%asi
        stxa    %g0, [%o0+0x80]%asi
        stxa    %g0, [%o0+0xc0]%asi

        stxa    %g0, [%o0+0x8]%asi
        stxa    %g0, [%o0+0x10]%asi
        stxa    %g0, [%o0+0x18]%asi
        stxa    %g0, [%o0+0x20]%asi
        stxa    %g0, [%o0+0x28]%asi
        stxa    %g0, [%o0+0x30]%asi
        stxa    %g0, [%o0+0x38]%asi

        stxa    %g0, [%o0+0x48]%asi
        stxa    %g0, [%o0+0x50]%asi
        stxa    %g0, [%o0+0x58]%asi
        stxa    %g0, [%o0+0x60]%asi
        stxa    %g0, [%o0+0x68]%asi
        stxa    %g0, [%o0+0x70]%asi
        stxa    %g0, [%o0+0x78]%asi

        stxa    %g0, [%o0+0x88]%asi
        stxa    %g0, [%o0+0x90]%asi
        stxa    %g0, [%o0+0x98]%asi
        stxa    %g0, [%o0+0xa0]%asi
        stxa    %g0, [%o0+0xa8]%asi
        stxa    %g0, [%o0+0xb0]%asi
        stxa    %g0, [%o0+0xb8]%asi

        stxa    %g0, [%o0+0xc8]%asi
        stxa    %g0, [%o0+0xd0]%asi
        stxa    %g0, [%o0+0xd8]%asi
        stxa    %g0, [%o0+0xe0]%asi
        stxa    %g0, [%o0+0xe8]%asi
        stxa    %g0, [%o0+0xf0]%asi
        stxa    %g0, [%o0+0xf8]%asi

        sub     %o4, 0x100, %o4
        cmp     %o4, 0x100
        bgu,pt  %ncc, 2b
        add     %o0, 0x100, %o0

3:
        ! ... check if 64 bytes to set
        cmp     %o4, 0x40
        blu     %ncc, .bzero_blk_done
        nop

4:
        stxa    %g0, [%o0+0x0]%asi
        stxa    %g0, [%o0+0x8]%asi
        stxa    %g0, [%o0+0x10]%asi
        stxa    %g0, [%o0+0x18]%asi
        stxa    %g0, [%o0+0x20]%asi
        stxa    %g0, [%o0+0x28]%asi
        stxa    %g0, [%o0+0x30]%asi
        stxa    %g0, [%o0+0x38]%asi

        subcc   %o4, 0x40, %o4
        bgu,pt  %ncc, 4b
        add     %o0, 0x40, %o0

.bzero_blk_done:
        membar  #Sync
        !
        ! Undo asi register setting.
        !
        rd      %asi, %o4
        wr      %g0, ASI_P, %asi
        cmp     %o4, ASI_BLK_INIT_ST_QUAD_LDD_P
        bne,a   %ncc, .bzero_small
        wr      %g0, ASI_USER, %asi

.bzero_small:
        ! Set the remaining doubles
        subcc   %o3, 8, %o3             ! Can we store any doubles?
        blu,pn  %ncc, .byteclr
        and     %o1, 7, %o1             ! calc bytes left after doubles

.dbclr:
        stxa    %g0, [%o0]%asi          ! Clear the doubles
        subcc   %o3, 8, %o3
        bgeu,pt %ncc, .dbclr
        add     %o0, 8, %o0

        ba      .byteclr
        nop

.wdalign:
        andcc   %o0, 3, %o3             ! is addr aligned on a word boundary
        bz,pn   %ncc, .wdclr
        andn    %o1, 3, %o3             ! create word sized count in %o3

        dec     %o1                     ! decrement count
        stba    %g0, [%o0]%asi          ! clear a byte
        ba      .wdalign
        inc     %o0                     ! next byte

.wdclr:
        sta     %g0, [%o0]%asi          ! 4-byte clearing loop
        subcc   %o3, 4, %o3
        bnz,pt  %ncc, .wdclr
        inc     4, %o0

        and     %o1, 3, %o1             ! leftover count, if any

.byteclr:
        ! Set the leftover bytes
        brz     %o1, .bzero_exit
        nop

7:
        deccc   %o1                     ! byte clearing loop
        stba    %g0, [%o0]%asi
        bgu,pt  %ncc, 7b
        inc     %o0

.bzero_exit:
        !
        ! We're just concerned with whether t_lofault was set
        ! when we came in. We end up here from either kzero()
        ! or bzero(). kzero() *always* sets a lofault handler.
        ! It ORs LOFAULT_SET into %o5 to indicate it has done
        ! this even if the value of %o5 is otherwise zero.
        ! bzero() sets a lofault handler *only* if one was
        ! previously set. Accordingly we need to examine
        ! %o5 and if it is non-zero be sure to clear LOFAULT_SET
        ! before resetting the error handler.
        !
        tst     %o5
        bz      %ncc, 1f
        andn    %o5, LOFAULT_SET, %o5
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1:
        retl
        clr     %o0                     ! return (0)

        SET_SIZE(bzero)