root/usr/src/lib/libc/amd64/gen/strcpy.S
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 *      str[n]cpy - copy [n] chars from second operand into first operand
 */
#include "SYS.h"
#include "proc64_id.h"

/*
 * LABEL(s) pastes `s` onto the `.strcpy` prefix, so every internal branch
 * target in this file is spelled .strcpy<name>.
 */
#define LABEL(s) .strcpy##s

/*
 * Entry point and dispatch.
 *
 *      strcpy:  %rdi = destination, %rsi = source
 *      strncpy: %rdi = destination, %rsi = source, %rdx = count
 *               (built when USE_AS_STRNCPY is defined)
 *
 * %rax receives the original destination pointer as the return value.
 *
 * The code below
 *   1. checks the 16-byte aligned dqword containing the first source byte
 *      for a null (and, for strncpy, for the count being satisfied there),
 *   2. goes straight to ashr_0 when source and destination are both
 *      16-byte aligned,
 *   3. otherwise checks the next source dqword, copies the first 16 bytes
 *      with two 8-byte moves, rounds the destination up to the next
 *      16-byte boundary, recomputes the source offset relative to it and
 *      jumps through unaligned_table to the matching ashr_N case.
 *
 * Registers handed to the ashr_N cases:
 *      %rsi    source, rounded down to a 16-byte boundary
 *      %rdi    destination, 16-byte aligned
 *      %r9     source offset within its dqword
 *      %r10    16 - source offset
 *      %r8     strncpy only: count still to be satisfied
 *      %xmm0   zero (no null seen so far)
 */
#ifdef USE_AS_STRNCPY
        ENTRY(strncpy)
        /*
         * The count is a full 64-bit value, so all of %rdx has to be tested.
         * Testing only %edx would treat every count that is a multiple of
         * 2^32 as zero and return without copying anything.
         */
        test    %rdx, %rdx
        jz      LABEL(strncpy_exitz)
        mov     %rdx, %r8                       /* r8 = count still to be satisfied */
#else
        ENTRY(strcpy)                           /* (char *, const char *) */
        xor     %rdx, %rdx
#endif
        mov     %esi, %ecx
        and     $0xfffffffffffffff0, %rsi       /* force rsi 16 byte align */
        and     $0xf, %rcx                      /* rcx = src offset within its dqword */
        mov     %rdi, %rax                      /* save destination address for return value */


        pxor    %xmm0, %xmm0                    /* clear %xmm0 for null char checks */
        pcmpeqb (%rsi), %xmm0                   /* check 16 bytes in src for null */
        pmovmskb %xmm0, %edx
        shr     %cl, %edx                       /* adjust for offset from 16byte boundary */
        test    %edx, %edx                      /* edx will be 0 if chars are non-null */
        jnz     LABEL(less16bytes)              /* null char found in first 16 bytes examined */
#ifdef USE_AS_STRNCPY
        /*
         * Check if the count is satisfied in first 16 bytes examined.
         * r11 = count + src offset - 16; zero or negative means the count
         * ends inside the dqword just examined.
         */
        lea     -16(%r8, %rcx), %r11
        cmp     $0, %r11
        jle     LABEL(less16bytes)
#endif
        mov     %rcx, %r9                       /* rsi alignment offset */
        or      %edi, %ecx
        and     $0xf, %ecx                      /* ZF set iff src and dest offsets are both 0 */
        lea     -16(%r9), %r10                  /* lea leaves the flags from the and intact */
        jz      LABEL(ashr_0)                   /* src and dest are both 16 byte aligned */

        neg     %r10                            /* max src bytes remaining in current dqword */

        pxor    %xmm0, %xmm0                    /* clear %xmm0, may be polluted by unaligned operation */
        pcmpeqb 16(%rsi), %xmm0                 /* check next 16 bytes in src for a null */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(less32bytes)              /* null char found in first 32 bytes examined */

#ifdef USE_AS_STRNCPY
        /*
         * If strncpy count <= 16 go to exit case
         */
        sub     $16, %r8
        jbe     LABEL(less32bytes_strncpy_truncation)
#endif
        /*
         * At least 16 bytes to copy to destination string. Move them now.
         * Don't worry about alignment.
         */
        mov     (%rsi, %r9), %rdx
        mov     %rdx, (%rdi)
        mov     8(%rsi, %r9), %rdx
        mov     %rdx, 8(%rdi)

        /*
         * so far destination rdi may be aligned by 16, re-calculate rsi and
         * jump to corresponding src/dest relative offset case.
         *      rcx is offset of rsi
         *      rdx is offset of rdi
         */
        and     $0xfffffffffffffff0, %rdi       /* force rdi 16 byte align */
        mov     %rax, %rdx                      /* rax contains original rdi */
        xor     %rdi, %rdx                      /* same effect as "and $0xf, %rdx" */
#ifdef USE_AS_STRNCPY
        /*
         * Will now do 16 byte aligned stores. Stores may overlap some bytes
         * (ie store twice) if destination was unaligned. Compensate here.
         */
        add     %rdx, %r8                       /* compensate for overlap */
#endif

        add     $16, %rdi                       /* next 16 bytes for dest */

        /*
         * align src to 16-byte boundary. Could be up or down depending on
         * whether src offset - dest offset > 0 (up) or
         *  src offset - dest offset < 0 (down).
         */
        sub     %rdx, %r9                       /* src offset - dest offset */

        lea     16(%r9, %rsi), %rsi
        mov     %esi, %ecx                      /* for new src offset */
        and     $0xfffffffffffffff0, %rsi       /* force rsi 16 byte align */

        and     $0xf, %ecx                      /* new src offset is 0 if rsi/rdi have same alignment */
        jz      LABEL(ashr_0)

#ifdef USE_AS_STRNCPY
        xor     %edx, %edx                      /* In case unaligned_exit is taken */
#endif
        /*
         * Jump to case corresponding to source/dest string relative offsets
         * Index = (16 + (src offset - dest offset)) % 16
         *
         * unaligned_table holds 32-bit entries that are added to the table
         * address to form the jump target.
         */
        lea     -16(%rcx), %r10
        mov     %rcx, %r9
        neg     %r10                            /* max src bytes remaining in current dqword */
        lea     LABEL(unaligned_table)(%rip), %r11
        movslq  (%r11, %rcx, 4), %rcx
        lea     (%r11, %rcx), %rcx
        jmp     *%rcx

/*
 * ashr_0 handles the following cases:
 *      src alignment offset = dest alignment offset
 *
 * Both %rsi and %rdi are 16-byte aligned here, so whole dqwords are moved
 * with movdqa and no byte shifting is needed.
 *
 * Registers as used below:
 *      %rcx    zero on entry (both branches into ashr_0 are taken on the
 *              zero result of an "and $0xf, %ecx"); used as the byte index
 *              inside ashr_0_loop
 *      %xmm0   expected to be all zeroes on entry; pcmpeqb leaves it zero
 *              for as long as no null byte is found
 *      %r8     strncpy only: count still to be satisfied
 *
 * aligned_16bytes, aligned_exit and strncpy_truncation_aligned are not part
 * of this section; they are reached when a null byte or the end of the
 * count is detected.
 */
        .p2align 5
LABEL(ashr_0):
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                /* count ends within the next 16 bytes? */
        jbe     LABEL(strncpy_truncation_aligned)
#endif
        movdqa  (%rsi), %xmm1           /* fetch 16 bytes from src string */
        movdqa  %xmm1, (%rdi)           /* store 16 bytes into dest string */
        add     $16, %rsi
        add     $16, %rdi
        pcmpeqb (%rsi), %xmm0           /* check 16 bytes in src for a null */
        pmovmskb %xmm0, %edx

        test    %edx, %edx              /* edx will be 0 if chars are non-null */
        jnz     LABEL(aligned_16bytes)  /* exit tail */

/*
 * Main aligned loop, unrolled four times. Each step copies the dqword at
 * index %rcx (already known to hold no null), advances the index and then
 * checks the following source dqword for a null before it is copied.
 */
LABEL(ashr_0_loop):
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_aligned)
#endif
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  %xmm1, (%rdi, %rcx)
        add     $16, %rcx
        pcmpeqb (%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(aligned_exit)

        /* second unrolled step */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_aligned)
#endif
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  %xmm1, (%rdi, %rcx)
        add     $16, %rcx
        pcmpeqb  (%rsi, %rcx), %xmm0
        pmovmskb  %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(aligned_exit)

        /* third unrolled step */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_aligned)
#endif
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  %xmm1, (%rdi, %rcx)

        add     $16, %rcx
        pcmpeqb  (%rsi, %rcx), %xmm0
        pmovmskb  %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(aligned_exit)

        /* fourth unrolled step; loops back while no null has been seen */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_aligned)
#endif
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  %xmm1, (%rdi, %rcx)
        add     $16, %rcx
        pcmpeqb  (%rsi, %rcx), %xmm0
        pmovmskb  %xmm0, %edx
        test    %edx, %edx
        jz      LABEL(ashr_0_loop)
        jmp     LABEL(aligned_exit)


/*
 * ashr_15 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 15
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Registers as used below:
 *      %rsi    16-byte aligned source base
 *      %rdi    16-byte aligned destination base
 *      %rcx    byte index applied to both bases, advanced by 16 per store
 *      %xmm0   expected to be all zeroes on entry; pcmpeqb leaves it zero
 *              for as long as no null byte is found
 *      %r8     strncpy only: count still to be satisfied
 *      %r10    strncpy only: compared against %r8 to see whether the count
 *              ends inside the source dqword already being consumed
 *
 * Every store is assembled from the upper 1 byte of the dqword at
 * (%rsi, %rcx) followed by the low 15 bytes of the dqword at
 * 16(%rsi, %rcx). The USE_SSSE3 bit of .memops_method selects palignr
 * (SSSE3) or psrldq/pslldq/por (SSE2) for that merge; each loop body is
 * unrolled twice.
 *
 * unaligned_exit and strncpy_truncation_unaligned are not part of this
 * section; they are reached when a null byte or the end of the count is
 * detected.
 */
        .p2align 4
LABEL(ashr_15):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8                               /* count ends in current src dqword? */
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_15_use_sse2)

        .p2align 4
LABEL(ashr_15_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3                   /* next aligned src dqword */
        pcmpeqb %xmm3, %xmm0                            /* look for a null in it */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        /* the .byte lines are the hand-assembled encoding of this palignr */
        #palignr $15, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0f

        movdqa  %xmm3, (%rdi, %rcx)                     /* aligned 16-byte store */
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSSE3 step */
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $15, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0f

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_15_use_ssse3)

        .p2align 4
LABEL(ashr_15_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0                   /* look for a null in next src dqword */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3                   /* next src dqword */
        movdqa  (%rsi, %rcx), %xmm2                     /* current src dqword */

        psrldq  $15, %xmm2                              /* keep its upper 1 byte, moved to the bottom */
        pslldq  $1, %xmm3                               /* make room below the next dqword */
        por     %xmm2, %xmm3                            /* merged 16 bytes for the store */

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSE2 step */
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $15, %xmm2
        pslldq  $1, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_15_use_sse2)


/*
 * ashr_14 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 14
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Registers as used below:
 *      %rsi    16-byte aligned source base
 *      %rdi    16-byte aligned destination base
 *      %rcx    byte index applied to both bases, advanced by 16 per store
 *      %xmm0   expected to be all zeroes on entry; pcmpeqb leaves it zero
 *              for as long as no null byte is found
 *      %r8     strncpy only: count still to be satisfied
 *      %r10    strncpy only: compared against %r8 to see whether the count
 *              ends inside the source dqword already being consumed
 *
 * Every store is assembled from the upper 2 bytes of the dqword at
 * (%rsi, %rcx) followed by the low 14 bytes of the dqword at
 * 16(%rsi, %rcx). The USE_SSSE3 bit of .memops_method selects palignr
 * (SSSE3) or psrldq/pslldq/por (SSE2) for that merge; each loop body is
 * unrolled twice.
 *
 * unaligned_exit and strncpy_truncation_unaligned are not part of this
 * section; they are reached when a null byte or the end of the count is
 * detected.
 */
        .p2align 4
LABEL(ashr_14):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8                               /* count ends in current src dqword? */
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_14_use_sse2)

        .p2align 4
LABEL(ashr_14_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3                   /* next aligned src dqword */
        pcmpeqb %xmm3, %xmm0                            /* look for a null in it */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        /* the .byte lines are the hand-assembled encoding of this palignr */
        #palignr $14, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0e

        movdqa  %xmm3, (%rdi, %rcx)                     /* aligned 16-byte store */
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSSE3 step */
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $14, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0e

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_14_use_ssse3)

        .p2align 4
LABEL(ashr_14_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0                   /* look for a null in next src dqword */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3                   /* next src dqword */
        movdqa  (%rsi, %rcx), %xmm2                     /* current src dqword */

        psrldq  $14, %xmm2                              /* keep its upper 2 bytes, moved to the bottom */
        pslldq  $2, %xmm3                               /* make room below the next dqword */
        por     %xmm2, %xmm3                            /* merged 16 bytes for the store */

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSE2 step */
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $14, %xmm2
        pslldq  $2, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_14_use_sse2)


/*
 * ashr_13 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 13
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Registers as used below:
 *      %rsi    16-byte aligned source base
 *      %rdi    16-byte aligned destination base
 *      %rcx    byte index applied to both bases, advanced by 16 per store
 *      %xmm0   expected to be all zeroes on entry; pcmpeqb leaves it zero
 *              for as long as no null byte is found
 *      %r8     strncpy only: count still to be satisfied
 *      %r10    strncpy only: compared against %r8 to see whether the count
 *              ends inside the source dqword already being consumed
 *
 * Every store is assembled from the upper 3 bytes of the dqword at
 * (%rsi, %rcx) followed by the low 13 bytes of the dqword at
 * 16(%rsi, %rcx). The USE_SSSE3 bit of .memops_method selects palignr
 * (SSSE3) or psrldq/pslldq/por (SSE2) for that merge; each loop body is
 * unrolled twice.
 *
 * unaligned_exit and strncpy_truncation_unaligned are not part of this
 * section; they are reached when a null byte or the end of the count is
 * detected.
 */
        .p2align 4
LABEL(ashr_13):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8                               /* count ends in current src dqword? */
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_13_use_sse2)

        .p2align 4
LABEL(ashr_13_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3                   /* next aligned src dqword */
        pcmpeqb %xmm3, %xmm0                            /* look for a null in it */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        /* the .byte lines are the hand-assembled encoding of this palignr */
        #palignr $13, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0d

        movdqa  %xmm3, (%rdi, %rcx)                     /* aligned 16-byte store */
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSSE3 step */
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $13, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0d

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_13_use_ssse3)

        .p2align 4
LABEL(ashr_13_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0                   /* look for a null in next src dqword */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3                   /* next src dqword */
        movdqa  (%rsi, %rcx), %xmm2                     /* current src dqword */

        psrldq  $13, %xmm2                              /* keep its upper 3 bytes, moved to the bottom */
        pslldq  $3, %xmm3                               /* make room below the next dqword */
        por     %xmm2, %xmm3                            /* merged 16 bytes for the store */

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSE2 step */
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $13, %xmm2
        pslldq  $3, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_13_use_sse2)


/*
 * ashr_12 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 12
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Registers as used below:
 *      %rsi    16-byte aligned source base
 *      %rdi    16-byte aligned destination base
 *      %rcx    byte index applied to both bases, advanced by 16 per store
 *      %xmm0   expected to be all zeroes on entry; pcmpeqb leaves it zero
 *              for as long as no null byte is found
 *      %r8     strncpy only: count still to be satisfied
 *      %r10    strncpy only: compared against %r8 to see whether the count
 *              ends inside the source dqword already being consumed
 *
 * Every store is assembled from the upper 4 bytes of the dqword at
 * (%rsi, %rcx) followed by the low 12 bytes of the dqword at
 * 16(%rsi, %rcx). The USE_SSSE3 bit of .memops_method selects palignr
 * (SSSE3) or psrldq/pslldq/por (SSE2) for that merge; each loop body is
 * unrolled twice.
 *
 * unaligned_exit and strncpy_truncation_unaligned are not part of this
 * section; they are reached when a null byte or the end of the count is
 * detected.
 */
        .p2align 4
LABEL(ashr_12):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8                               /* count ends in current src dqword? */
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_12_use_sse2)

        .p2align 4
LABEL(ashr_12_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3                   /* next aligned src dqword */
        pcmpeqb %xmm3, %xmm0                            /* look for a null in it */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        /* the .byte lines are the hand-assembled encoding of this palignr */
        #palignr $12, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0c

        movdqa  %xmm3, (%rdi, %rcx)                     /* aligned 16-byte store */
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSSE3 step */
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $12, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0c

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_12_use_ssse3)

        .p2align 4
LABEL(ashr_12_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0                   /* look for a null in next src dqword */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3                   /* next src dqword */
        movdqa  (%rsi, %rcx), %xmm2                     /* current src dqword */

        psrldq  $12, %xmm2                              /* keep its upper 4 bytes, moved to the bottom */
        pslldq  $4, %xmm3                               /* make room below the next dqword */
        por     %xmm2, %xmm3                            /* merged 16 bytes for the store */

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSE2 step */
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $12, %xmm2
        pslldq  $4, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_12_use_sse2)


/*
 * ashr_11 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 11
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Registers as used below:
 *      %rsi    16-byte aligned source base
 *      %rdi    16-byte aligned destination base
 *      %rcx    byte index applied to both bases, advanced by 16 per store
 *      %xmm0   expected to be all zeroes on entry; pcmpeqb leaves it zero
 *              for as long as no null byte is found
 *      %r8     strncpy only: count still to be satisfied
 *      %r10    strncpy only: compared against %r8 to see whether the count
 *              ends inside the source dqword already being consumed
 *
 * Every store is assembled from the upper 5 bytes of the dqword at
 * (%rsi, %rcx) followed by the low 11 bytes of the dqword at
 * 16(%rsi, %rcx). The USE_SSSE3 bit of .memops_method selects palignr
 * (SSSE3) or psrldq/pslldq/por (SSE2) for that merge; each loop body is
 * unrolled twice.
 *
 * unaligned_exit and strncpy_truncation_unaligned are not part of this
 * section; they are reached when a null byte or the end of the count is
 * detected.
 */
        .p2align 4
LABEL(ashr_11):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8                               /* count ends in current src dqword? */
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_11_use_sse2)

        .p2align 4
LABEL(ashr_11_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3                   /* next aligned src dqword */
        pcmpeqb %xmm3, %xmm0                            /* look for a null in it */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        /* the .byte lines are the hand-assembled encoding of this palignr */
        #palignr $11, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0b

        movdqa  %xmm3, (%rdi, %rcx)                     /* aligned 16-byte store */
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSSE3 step */
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $11, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0b

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_11_use_ssse3)

        .p2align 4
LABEL(ashr_11_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0                   /* look for a null in next src dqword */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3                   /* next src dqword */
        movdqa  (%rsi, %rcx), %xmm2                     /* current src dqword */

        psrldq  $11, %xmm2                              /* keep its upper 5 bytes, moved to the bottom */
        pslldq  $5, %xmm3                               /* make room below the next dqword */
        por     %xmm2, %xmm3                            /* merged 16 bytes for the store */

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSE2 step */
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $11, %xmm2
        pslldq  $5, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_11_use_sse2)


/*
 * ashr_10 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 10
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Registers as used below:
 *      %rsi    16-byte aligned source base
 *      %rdi    16-byte aligned destination base
 *      %rcx    byte index applied to both bases, advanced by 16 per store
 *      %xmm0   expected to be all zeroes on entry; pcmpeqb leaves it zero
 *              for as long as no null byte is found
 *      %r8     strncpy only: count still to be satisfied
 *      %r10    strncpy only: compared against %r8 to see whether the count
 *              ends inside the source dqword already being consumed
 *
 * Every store is assembled from the upper 6 bytes of the dqword at
 * (%rsi, %rcx) followed by the low 10 bytes of the dqword at
 * 16(%rsi, %rcx). The USE_SSSE3 bit of .memops_method selects palignr
 * (SSSE3) or psrldq/pslldq/por (SSE2) for that merge; each loop body is
 * unrolled twice.
 *
 * unaligned_exit and strncpy_truncation_unaligned are not part of this
 * section; they are reached when a null byte or the end of the count is
 * detected.
 */
        .p2align 4
LABEL(ashr_10):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8                               /* count ends in current src dqword? */
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_10_use_sse2)

        .p2align 4
LABEL(ashr_10_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3                   /* next aligned src dqword */
        pcmpeqb %xmm3, %xmm0                            /* look for a null in it */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        /* the .byte lines are the hand-assembled encoding of this palignr */
        #palignr $10, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0a

        movdqa  %xmm3, (%rdi, %rcx)                     /* aligned 16-byte store */
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSSE3 step */
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $10, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x0a

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_10_use_ssse3)

        .p2align 4
LABEL(ashr_10_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0                   /* look for a null in next src dqword */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3                   /* next src dqword */
        movdqa  (%rsi, %rcx), %xmm2                     /* current src dqword */

        psrldq  $10, %xmm2                              /* keep its upper 6 bytes, moved to the bottom */
        pslldq  $6, %xmm3                               /* make room below the next dqword */
        por     %xmm2, %xmm3                            /* merged 16 bytes for the store */

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSE2 step */
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $10, %xmm2
        pslldq  $6, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_10_use_sse2)


/*
 * ashr_9 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 9
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Registers as used below:
 *      %rsi    16-byte aligned source base
 *      %rdi    16-byte aligned destination base
 *      %rcx    byte index applied to both bases, advanced by 16 per store
 *      %xmm0   expected to be all zeroes on entry; pcmpeqb leaves it zero
 *              for as long as no null byte is found
 *      %r8     strncpy only: count still to be satisfied
 *      %r10    strncpy only: compared against %r8 to see whether the count
 *              ends inside the source dqword already being consumed
 *
 * Every store is assembled from the upper 7 bytes of the dqword at
 * (%rsi, %rcx) followed by the low 9 bytes of the dqword at
 * 16(%rsi, %rcx). The USE_SSSE3 bit of .memops_method selects palignr
 * (SSSE3) or psrldq/pslldq/por (SSE2) for that merge; each loop body is
 * unrolled twice.
 *
 * unaligned_exit and strncpy_truncation_unaligned are not part of this
 * section; they are reached when a null byte or the end of the count is
 * detected.
 */
        .p2align 4
LABEL(ashr_9):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8                               /* count ends in current src dqword? */
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_9_use_sse2)

        .p2align 4
LABEL(ashr_9_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3                   /* next aligned src dqword */
        pcmpeqb %xmm3, %xmm0                            /* look for a null in it */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        /* the .byte lines are the hand-assembled encoding of this palignr */
        #palignr $9, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x09

        movdqa  %xmm3, (%rdi, %rcx)                     /* aligned 16-byte store */
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSSE3 step */
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $9, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x09

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_9_use_ssse3)

        .p2align 4
LABEL(ashr_9_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0                   /* look for a null in next src dqword */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3                   /* next src dqword */
        movdqa  (%rsi, %rcx), %xmm2                     /* current src dqword */

        psrldq  $9, %xmm2                               /* keep its upper 7 bytes, moved to the bottom */
        pslldq  $7, %xmm3                               /* make room below the next dqword */
        por     %xmm2, %xmm3                            /* merged 16 bytes for the store */

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSE2 step */
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $9, %xmm2
        pslldq  $7, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_9_use_sse2)


/*
 * ashr_8 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 8
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Registers as used below:
 *      %rsi    16-byte aligned source base
 *      %rdi    16-byte aligned destination base
 *      %rcx    byte index applied to both bases, advanced by 16 per store
 *      %xmm0   expected to be all zeroes on entry; pcmpeqb leaves it zero
 *              for as long as no null byte is found
 *      %r8     strncpy only: count still to be satisfied
 *      %r10    strncpy only: compared against %r8 to see whether the count
 *              ends inside the source dqword already being consumed
 *
 * Every store is assembled from the upper 8 bytes of the dqword at
 * (%rsi, %rcx) followed by the low 8 bytes of the dqword at
 * 16(%rsi, %rcx). The USE_SSSE3 bit of .memops_method selects palignr
 * (SSSE3) or psrldq/pslldq/por (SSE2) for that merge; each loop body is
 * unrolled twice.
 *
 * unaligned_exit and strncpy_truncation_unaligned are not part of this
 * section; they are reached when a null byte or the end of the count is
 * detected.
 */
        .p2align 4
LABEL(ashr_8):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8                               /* count ends in current src dqword? */
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_8_use_sse2)

        .p2align 4
LABEL(ashr_8_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3                   /* next aligned src dqword */
        pcmpeqb %xmm3, %xmm0                            /* look for a null in it */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        /* the .byte lines are the hand-assembled encoding of this palignr */
        #palignr $8, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x08

        movdqa  %xmm3, (%rdi, %rcx)                     /* aligned 16-byte store */
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSSE3 step */
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $8, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x08

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_8_use_ssse3)

        .p2align 4
LABEL(ashr_8_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0                   /* look for a null in next src dqword */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                                /* count ends within this store? */
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3                   /* next src dqword */
        movdqa  (%rsi, %rcx), %xmm2                     /* current src dqword */

        psrldq  $8, %xmm2                               /* keep its upper 8 bytes, moved to the bottom */
        pslldq  $8, %xmm3                               /* make room below the next dqword */
        por     %xmm2, %xmm3                            /* merged 16 bytes for the store */

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        /* second unrolled copy of the SSE2 step */
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $8, %xmm2
        pslldq  $8, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_8_use_sse2)


/*
 * ashr_7 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 7
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Entry: clears the loop index %rcx and, for strncpy, leaves through
 * unaligned_exit when the remaining count %r8 is not above %r10.  The
 * USE_SSSE3 bit of .memops_method then selects the palignr loop that
 * follows or the SSE2 shift/or loop at ashr_7_use_sse2.
 * NOTE(review): %r9 and %r10 are set up before this section; confirm their
 * exact meaning against the dispatch code.
 */
        .p2align 4
LABEL(ashr_7):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_7_use_sse2)

        /*
         * SSSE3 loop, unrolled twice.  Each step loads the source block at
         * 16(%rsi, %rcx), tests it for a null byte against %xmm0 (expected
         * to be all zeroes here; it is all zeroes again whenever the jnz
         * falls through), then stores 16 bytes at (%rdi, %rcx) formed by
         * palignr from the blocks at (%rsi, %rcx) and 16(%rsi, %rcx).
         * movdqa requires both addresses to be 16-byte aligned.
         */
        .p2align 4
LABEL(ashr_7_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        /* %xmm3 = bytes 7..22 of the 32 bytes at (%rsi, %rcx); emitted as raw opcode bytes */
        #palignr $7, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x07

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $7, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x07

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_7_use_ssse3)

        /*
         * SSE2 variant of the ashr_7 copy loop, unrolled twice.  Each step
         * tests the block at 16(%rsi, %rcx) for a null byte against %xmm0
         * (expected to be all zeroes here; it is all zeroes again whenever
         * the jnz falls through), then builds the 16 bytes for
         * (%rdi, %rcx) without palignr: the block at (%rsi, %rcx) shifted
         * right by 7 bytes is or-ed with the next block shifted left by 9.
         */
        .p2align 4
LABEL(ashr_7_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $7, %xmm2                       /* low block: bytes 7..15 move to 0..8 */
        pslldq  $9, %xmm3                       /* next block: bytes 0..6 move to 9..15 */
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $7, %xmm2
        pslldq  $9, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_7_use_sse2)


/*
 * ashr_6 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 6
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Entry: clears the loop index %rcx and, for strncpy, leaves through
 * unaligned_exit when the remaining count %r8 is not above %r10.  The
 * USE_SSSE3 bit of .memops_method then selects the palignr loop that
 * follows or the SSE2 shift/or loop at ashr_6_use_sse2.
 * NOTE(review): %r9 and %r10 are set up before this section; confirm their
 * exact meaning against the dispatch code.
 */
        .p2align 4
LABEL(ashr_6):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_6_use_sse2)

        /*
         * SSSE3 loop, unrolled twice.  Each step loads the source block at
         * 16(%rsi, %rcx), tests it for a null byte against %xmm0 (expected
         * to be all zeroes here; it is all zeroes again whenever the jnz
         * falls through), then stores 16 bytes at (%rdi, %rcx) formed by
         * palignr from the blocks at (%rsi, %rcx) and 16(%rsi, %rcx).
         * movdqa requires both addresses to be 16-byte aligned.
         */
        .p2align 4
LABEL(ashr_6_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        /* %xmm3 = bytes 6..21 of the 32 bytes at (%rsi, %rcx); emitted as raw opcode bytes */
        #palignr $6, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x06

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $6, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x06

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_6_use_ssse3)

        /*
         * SSE2 variant of the ashr_6 copy loop, unrolled twice.  Each step
         * tests the block at 16(%rsi, %rcx) for a null byte against %xmm0
         * (expected to be all zeroes here; it is all zeroes again whenever
         * the jnz falls through), then builds the 16 bytes for
         * (%rdi, %rcx) without palignr: the block at (%rsi, %rcx) shifted
         * right by 6 bytes is or-ed with the next block shifted left by 10.
         */
        .p2align 4
LABEL(ashr_6_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $6, %xmm2                       /* low block: bytes 6..15 move to 0..9 */
        pslldq  $10, %xmm3                      /* next block: bytes 0..5 move to 10..15 */
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $6, %xmm2
        pslldq  $10, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_6_use_sse2)


/*
 * ashr_5 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 5
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Entry: clears the loop index %rcx and, for strncpy, leaves through
 * unaligned_exit when the remaining count %r8 is not above %r10.  The
 * USE_SSSE3 bit of .memops_method then selects the palignr loop that
 * follows or the SSE2 shift/or loop at ashr_5_use_sse2.
 * NOTE(review): %r9 and %r10 are set up before this section; confirm their
 * exact meaning against the dispatch code.
 */
        .p2align 4
LABEL(ashr_5):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_5_use_sse2)

        /*
         * SSSE3 loop, unrolled twice.  Each step loads the source block at
         * 16(%rsi, %rcx), tests it for a null byte against %xmm0 (expected
         * to be all zeroes here; it is all zeroes again whenever the jnz
         * falls through), then stores 16 bytes at (%rdi, %rcx) formed by
         * palignr from the blocks at (%rsi, %rcx) and 16(%rsi, %rcx).
         * movdqa requires both addresses to be 16-byte aligned.
         */
        .p2align 4
LABEL(ashr_5_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        /* %xmm3 = bytes 5..20 of the 32 bytes at (%rsi, %rcx); emitted as raw opcode bytes */
        #palignr $5, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x05

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $5, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x05

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_5_use_ssse3)

        /*
         * SSE2 variant of the ashr_5 copy loop, unrolled twice.  Each step
         * tests the block at 16(%rsi, %rcx) for a null byte against %xmm0
         * (expected to be all zeroes here; it is all zeroes again whenever
         * the jnz falls through), then builds the 16 bytes for
         * (%rdi, %rcx) without palignr: the block at (%rsi, %rcx) shifted
         * right by 5 bytes is or-ed with the next block shifted left by 11.
         */
        .p2align 4
LABEL(ashr_5_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $5, %xmm2                       /* low block: bytes 5..15 move to 0..10 */
        pslldq  $11, %xmm3                      /* next block: bytes 0..4 move to 11..15 */
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $5, %xmm2
        pslldq  $11, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_5_use_sse2)


/*
 * ashr_4 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 4
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Entry: clears the loop index %rcx and, for strncpy, leaves through
 * unaligned_exit when the remaining count %r8 is not above %r10.  The
 * USE_SSSE3 bit of .memops_method then selects the palignr loop that
 * follows or the SSE2 shift/or loop at ashr_4_use_sse2.
 * NOTE(review): %r9 and %r10 are set up before this section; confirm their
 * exact meaning against the dispatch code.
 */
        .p2align 4
LABEL(ashr_4):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_4_use_sse2)

        /*
         * SSSE3 loop, unrolled twice.  Each step loads the source block at
         * 16(%rsi, %rcx), tests it for a null byte against %xmm0 (expected
         * to be all zeroes here; it is all zeroes again whenever the jnz
         * falls through), then stores 16 bytes at (%rdi, %rcx) formed by
         * palignr from the blocks at (%rsi, %rcx) and 16(%rsi, %rcx).
         * movdqa requires both addresses to be 16-byte aligned.
         */
        .p2align 4
LABEL(ashr_4_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        /* %xmm3 = bytes 4..19 of the 32 bytes at (%rsi, %rcx); emitted as raw opcode bytes */
        #palignr $4, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x04

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $4, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x04

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_4_use_ssse3)

        /*
         * SSE2 variant of the ashr_4 copy loop, unrolled twice.  Each step
         * tests the block at 16(%rsi, %rcx) for a null byte against %xmm0
         * (expected to be all zeroes here; it is all zeroes again whenever
         * the jnz falls through), then builds the 16 bytes for
         * (%rdi, %rcx) without palignr: the block at (%rsi, %rcx) shifted
         * right by 4 bytes is or-ed with the next block shifted left by 12.
         */
        .p2align 4
LABEL(ashr_4_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $4, %xmm2                       /* low block: bytes 4..15 move to 0..11 */
        pslldq  $12, %xmm3                      /* next block: bytes 0..3 move to 12..15 */
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $4, %xmm2
        pslldq  $12, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_4_use_sse2)


/*
 * ashr_3 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 3
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Entry: clears the loop index %rcx and, for strncpy, leaves through
 * unaligned_exit when the remaining count %r8 is not above %r10.  The
 * USE_SSSE3 bit of .memops_method then selects the palignr loop that
 * follows or the SSE2 shift/or loop at ashr_3_use_sse2.
 * NOTE(review): %r9 and %r10 are set up before this section; confirm their
 * exact meaning against the dispatch code.
 */
        .p2align 4
LABEL(ashr_3):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_3_use_sse2)

        /*
         * SSSE3 loop, unrolled twice.  Each step loads the source block at
         * 16(%rsi, %rcx), tests it for a null byte against %xmm0 (expected
         * to be all zeroes here; it is all zeroes again whenever the jnz
         * falls through), then stores 16 bytes at (%rdi, %rcx) formed by
         * palignr from the blocks at (%rsi, %rcx) and 16(%rsi, %rcx).
         * movdqa requires both addresses to be 16-byte aligned.
         */
        .p2align 4
LABEL(ashr_3_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        /* %xmm3 = bytes 3..18 of the 32 bytes at (%rsi, %rcx); emitted as raw opcode bytes */
        #palignr $3, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x03

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $3, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x03

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_3_use_ssse3)

        /*
         * SSE2 variant of the ashr_3 copy loop, unrolled twice.  Each step
         * tests the block at 16(%rsi, %rcx) for a null byte against %xmm0
         * (expected to be all zeroes here; it is all zeroes again whenever
         * the jnz falls through), then builds the 16 bytes for
         * (%rdi, %rcx) without palignr: the block at (%rsi, %rcx) shifted
         * right by 3 bytes is or-ed with the next block shifted left by 13.
         */
        .p2align 4
LABEL(ashr_3_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $3, %xmm2                       /* low block: bytes 3..15 move to 0..12 */
        pslldq  $13, %xmm3                      /* next block: bytes 0..2 move to 13..15 */
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $3, %xmm2
        pslldq  $13, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_3_use_sse2)


/*
 * ashr_2 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 2
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Entry: clears the loop index %rcx and, for strncpy, leaves through
 * unaligned_exit when the remaining count %r8 is not above %r10.  The
 * USE_SSSE3 bit of .memops_method then selects the palignr loop that
 * follows or the SSE2 shift/or loop at ashr_2_use_sse2.
 * NOTE(review): %r9 and %r10 are set up before this section; confirm their
 * exact meaning against the dispatch code.
 */
        .p2align 4
LABEL(ashr_2):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_2_use_sse2)

        /*
         * SSSE3 loop, unrolled twice.  Each step loads the source block at
         * 16(%rsi, %rcx), tests it for a null byte against %xmm0 (expected
         * to be all zeroes here; it is all zeroes again whenever the jnz
         * falls through), then stores 16 bytes at (%rdi, %rcx) formed by
         * palignr from the blocks at (%rsi, %rcx) and 16(%rsi, %rcx).
         * movdqa requires both addresses to be 16-byte aligned.
         */
        .p2align 4
LABEL(ashr_2_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        /* %xmm3 = bytes 2..17 of the 32 bytes at (%rsi, %rcx); emitted as raw opcode bytes */
        #palignr $2, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x02

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        #palignr $2, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x02

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_2_use_ssse3)

        /*
         * SSE2 variant of the ashr_2 copy loop, unrolled twice.  Each step
         * tests the block at 16(%rsi, %rcx) for a null byte against %xmm0
         * (expected to be all zeroes here; it is all zeroes again whenever
         * the jnz falls through), then builds the 16 bytes for
         * (%rdi, %rcx) without palignr: the block at (%rsi, %rcx) shifted
         * right by 2 bytes is or-ed with the next block shifted left by 14.
         */
        .p2align 4
LABEL(ashr_2_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $2, %xmm2                       /* low block: bytes 2..15 move to 0..13 */
        pslldq  $14, %xmm3                      /* next block: bytes 0..1 move to 14..15 */
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $2, %xmm2
        pslldq  $14, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_2_use_sse2)


/*
 * ashr_1 handles the following cases:
 *      (16 + (src offset - dest offset)) % 16 = 1
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Entry: clears the loop index %rcx and, for strncpy, leaves through
 * unaligned_exit when the remaining count %r8 is not above %r10.  The
 * USE_SSSE3 bit of .memops_method then selects the palignr loop that
 * follows or the SSE2 shift/or loop at ashr_1_use_sse2.
 * NOTE(review): %r9 and %r10 are set up before this section; confirm their
 * exact meaning against the dispatch code.
 */
        .p2align 4
LABEL(ashr_1):
        xor     %ecx, %ecx                              /* clear index */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        testl   $USE_SSSE3, .memops_method(%rip)        /* use sse2 or ssse3? */
        jz      LABEL(ashr_1_use_sse2)

        /*
         * SSSE3 loop, unrolled twice.  Each step loads the source block at
         * 16(%rsi, %rcx), tests it for a null byte against %xmm0 (expected
         * to be all zeroes here; it is all zeroes again whenever the jnz
         * falls through), then stores 16 bytes at (%rdi, %rcx) formed by
         * palignr from the blocks at (%rsi, %rcx) and 16(%rsi, %rcx).
         * movdqa requires both addresses to be 16-byte aligned.
         */
        .p2align 4
LABEL(ashr_1_use_ssse3):
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif

        /* %xmm3 = bytes 1..16 of the 32 bytes at (%rsi, %rcx); emitted as raw opcode bytes */
        #palignr $1, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x01

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        movdqa  16(%rsi, %rcx), %xmm3
        pcmpeqb %xmm3, %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif
        #palignr $1, (%rsi, %rcx), %xmm3
        .byte   0x66, 0x0F, 0x3A ,0x0F
        .byte   0x1c, 0x0e, 0x01

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_1_use_ssse3)

        /*
         * SSE2 variant of the ashr_1 copy loop, unrolled twice.  Each step
         * tests the block at 16(%rsi, %rcx) for a null byte against %xmm0
         * (expected to be all zeroes here; it is all zeroes again whenever
         * the jnz falls through), then builds the 16 bytes for
         * (%rdi, %rcx) without palignr: the block at (%rsi, %rcx) shifted
         * right by 1 byte is or-ed with the next block shifted left by 15.
         */
        .p2align 4
LABEL(ashr_1_use_sse2):
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx                    /* one mask bit per null byte */
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)           /* null found: finish in the exit tail */
#ifdef USE_AS_STRNCPY
        sub     $16, %r8                        /* charge 16 bytes against the count */
        jbe     LABEL(strncpy_truncation_unaligned)     /* count ends in this block */
#endif
        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $1, %xmm2                       /* low block: bytes 1..15 move to 0..14 */
        pslldq  $15, %xmm3                      /* next block: byte 0 moves to 15 */
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx

        /* second unrolled step: same sequence as above */
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        pcmpeqb 16(%rsi, %rcx), %xmm0
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
        sub     $16, %r8
        jbe     LABEL(strncpy_truncation_unaligned)
#endif

        movdqa  16(%rsi, %rcx), %xmm3
        movdqa  (%rsi, %rcx), %xmm2

        psrldq  $1, %xmm2
        pslldq  $15, %xmm3
        por     %xmm2, %xmm3

        movdqa  %xmm3, (%rdi, %rcx)
        add     $16, %rcx
#ifdef USE_AS_STRNCPY
        cmp     %r10, %r8
        jbe     LABEL(unaligned_exit)
#endif
        jmp     LABEL(ashr_1_use_sse2)


        /*
         * Exit tail code:
         * Up to 32 bytes are copied in the case of strcpy.
         *
         * The labels below are staggered entry points into one fall-through
         * sequence; a later entry point skips the fix-ups above it:
         *      less32bytes      clears the index %rcx first
         *      unaligned_exit   adds %r9 to %rsi and shifts the null mask in
         *                       %edx left by %r10
         *      aligned_exit     advances %rdi by the index %rcx
         *      less16bytes      advances %rsi by %rcx
         *      aligned_16bytes  applies no pointer fix-up
         * After the fix-ups %edx is the mask used to pick a tail_N routine,
         * either through tail_table (bsf) or the AMD_exit bit tests.
         */
        .p2align 4
LABEL(less32bytes):
        xor     %ecx, %ecx
LABEL(unaligned_exit):
        add     %r9, %rsi               /* r9 holds offset of rsi */
        mov     %rcx, %r9               /* park the index: shl takes its count in %cl */
        mov     %r10, %rcx
        shl     %cl, %edx               /* after shl, calculate the exact number to be filled */
        mov     %r9, %rcx               /* restore the index */
        .p2align 4
LABEL(aligned_exit):
        add     %rcx, %rdi              /* locate exact address for rdi */
LABEL(less16bytes):
        add     %rcx, %rsi              /* locate exact address for rsi */
LABEL(aligned_16bytes):
#ifdef USE_AS_STRNCPY
        /*
         * Null found in 16bytes checked. Set bit in bitmask corresponding to
         * the strncpy count argument. We will copy to the null (inclusive)
         * or count whichever comes first.
         *
         * %r9d = 1 << (%r8 - 1).  A 32-bit shl only uses the low five bits
         * of %cl, so the value is meaningful only for counts up to 32; for
         * larger counts it is skipped and the mask is left unchanged.
         */
        mov     $1, %r9d
        lea     -1(%r8), %rcx
        shl     %cl, %r9d
        cmp     $32, %r8
        ja      LABEL(strncpy_tail)
        or      %r9d, %edx
LABEL(strncpy_tail):
#endif
        /*
         * Check to see if BSF is fast on this processor. If not, use a
         * different exit tail.
         */
        testb   $USE_BSF, .memops_method(%rip)
        jz      LABEL(AMD_exit)
        bsf     %rdx, %rcx              /* Find byte with null char */
        /*
         * tail_table holds 32-bit signed offsets relative to its own
         * address: load the entry for the bit index, add the table base
         * and jump to the result.
         */
        lea     LABEL(tail_table)(%rip), %r11
        movslq  (%r11, %rcx, 4), %rcx
        lea     (%r11, %rcx), %rcx
        jmp     *%rcx

#ifdef USE_AS_STRNCPY
        /*
         * Count reached before null found.
         *
         * Reached from the copy loops after "sub $16, %r8" left %r8 at or
         * below zero.  Staggered entry points, as in the exit tail code:
         *      less32bytes_strncpy_truncation  clears the index %rcx first
         *      strncpy_truncation_unaligned    adds %r9 to %rsi
         *      strncpy_truncation_aligned      only applies the index
         * The 16 is added back to %r8, and (%r8 - 1) indexes tail_table so
         * that the selected tail copies the bytes that remain.
         * NOTE(review): tail_table is defined outside this section; entry k
         * is presumably tail_k, which copies k + 1 bytes - confirm.
         */
        .p2align 4
LABEL(less32bytes_strncpy_truncation):
        xor     %ecx, %ecx
LABEL(strncpy_truncation_unaligned):
        add     %r9, %rsi               /* next src char to copy */
LABEL(strncpy_truncation_aligned):
        add     %rcx, %rdi
        add     %rcx, %rsi
        add     $16, %r8                /* compensation */
        lea     -1(%r8), %rcx
        lea     LABEL(tail_table)(%rip), %r11
        movslq  (%r11, %rcx, 4), %rcx   /* table entry: offset from tail_table */
        lea     (%r11, %rcx), %rcx
        jmp     *%rcx

        /*
         * strncpy with a zero count (tested on the low 32 bits of the count
         * at function entry): nothing is copied, return the destination.
         */
        .p2align 4
LABEL(strncpy_exitz):
        mov     %rdi, %rax
        ret
#endif

        /*
         * Exit tail used when USE_BSF is clear in .memops_method: instead
         * of bsf plus the jump table, locate the lowest set bit of the mask
         * in %edx with explicit bit tests.  This stage handles bits 0-7
         * (%dl) and passes an all-zero %dl on to AMD_exit_more_8.  When
         * none of bits 0-6 is set, bit 7 must be, and control falls
         * through into tail_7.
         */
        .p2align 4
LABEL(AMD_exit):
        test    %dl, %dl
        jz      LABEL(AMD_exit_more_8)
        test    $0x01, %dl
        jnz     LABEL(tail_0)
        test    $0x02, %dl
        jnz     LABEL(tail_1)
        test    $0x04, %dl
        jnz     LABEL(tail_2)
        test    $0x08, %dl
        jnz     LABEL(tail_3)
        test    $0x10, %dl
        jnz     LABEL(tail_4)
        test    $0x20, %dl
        jnz     LABEL(tail_5)
        test    $0x40, %dl
        jnz     LABEL(tail_6)

        /*
         * tail_7: copy the final 8 bytes from (%rsi) to (%rdi) with one
         * 8-byte move.  strncpy: %cl = bytes written, %r8 = count left
         * after them; a non-zero remainder is zero filled by
         * strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_7):                          /* 8 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
#ifdef USE_AS_STRNCPY
        mov     $8, %cl
        sub     $8, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

#ifdef USE_AS_STRNCPY
        /*
         * Null terminated src string shorter than count. Fill the rest of the
         * destination with null chars.
         *
         * On entry (as set up by the tail_N routines):
         *      %rax = value to return to the caller
         *      %rdi = start of the bytes the tail routine just wrote
         *      %cl  = number of bytes the tail routine wrote
         *      %r8  = number of bytes still to clear (non-zero)
         *
         * The fill is done as (%r8 / 8) quadword stores followed by
         * (%r8 % 8) single byte stores.  The quadword count is computed in
         * the full 64-bit %rcx: a 32-bit shift would zero the upper half of
         * the count and leave the fill short when 4 GiB or more remain.
         * rep stosq relies on the direction flag being clear.
         */
        .p2align 4
LABEL(strncpy_fill_tail):
        mov     %rax, %rdx              /* save return value; %rax feeds stosq */
        movzx   %cl, %rax
        mov     %r8, %rcx
        add     %rax, %rdi              /* first byte past the copied data */
        xor     %eax, %eax              /* fill pattern: zero */
        shr     $3, %rcx                /* quadwords to clear, full 64-bit count */
        jz      LABEL(strncpy_fill_less_8)

        rep     stosq                   /* advances %rdi past the cleared quadwords */
LABEL(strncpy_fill_less_8):
        mov     %r8, %rcx
        and     $7, %rcx                /* bytes left after the quadwords */
        jz      LABEL(strncpy_fill_return)
LABEL(strncpy_fill_less_7):
        sub     $1, %ecx
        mov     %al, (%rdi, %rcx)       /* mov leaves the flags from sub intact */
        jnz     LABEL(strncpy_fill_less_7)
LABEL(strncpy_fill_return):
        mov     %rdx, %rax              /* restore return value */
        ret
#endif

        /*
         * tail_0: copy the final byte from (%rsi) to (%rdi).
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_0):                          /* 1 byte */
        mov     (%rsi), %cl
        mov     %cl, (%rdi)
#ifdef USE_AS_STRNCPY
        mov     $1, %cl
        sub     $1, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_1: copy the final 2 bytes from (%rsi) to (%rdi) with one
         * 2-byte move.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_1):                          /* 2 bytes */
        mov     (%rsi), %cx
        mov     %cx, (%rdi)
#ifdef USE_AS_STRNCPY
        mov     $2, %cl
        sub     $2, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_2: copy the final 3 bytes from (%rsi) to (%rdi) with two
         * overlapping 2-byte moves (bytes 0-1, then bytes 1-2).
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_2):                          /* 3 bytes */
        mov     (%rsi), %cx
        mov     %cx, (%rdi)
        mov     1(%rsi), %cx
        mov     %cx, 1(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $3, %cl
        sub     $3, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_3: copy the final 4 bytes from (%rsi) to (%rdi) with one
         * 4-byte move.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_3):                          /* 4 bytes */
        mov     (%rsi), %ecx
        mov     %ecx, (%rdi)
#ifdef USE_AS_STRNCPY
        mov     $4, %cl
        sub     $4, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_4: copy the final 5 bytes from (%rsi) to (%rdi) with two
         * overlapping 4-byte moves (bytes 0-3, then bytes 1-4).
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_4):                          /* 5 bytes */
        mov     (%rsi), %ecx
        mov     %ecx, (%rdi)
        mov     1(%rsi), %edx
        mov     %edx, 1(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $5, %cl
        sub     $5, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_5: copy the final 6 bytes from (%rsi) to (%rdi) with two
         * overlapping 4-byte moves (bytes 0-3, then bytes 2-5).
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_5):                          /* 6 bytes */
        mov     (%rsi), %ecx
        mov     %ecx, (%rdi)
        mov     2(%rsi), %edx
        mov     %edx, 2(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $6, %cl
        sub     $6, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_6: copy the final 7 bytes from (%rsi) to (%rdi) with two
         * overlapping 4-byte moves (bytes 0-3, then bytes 3-6).
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_6):                          /* 7 bytes */
        mov     (%rsi), %ecx
        mov     %ecx, (%rdi)
        mov     3(%rsi), %edx
        mov     %edx,3(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $7, %cl
        sub     $7, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_8: copy the final 9 bytes from (%rsi) to (%rdi): an 8-byte
         * move for bytes 0-7, then an overlapping 4-byte move for bytes 5-8.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_8):                          /* 9 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     5(%rsi), %edx
        mov     %edx, 5(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $9, %cl
        sub     $9, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * Second stage of the non-bsf exit: mask bits 0-7 were all clear,
         * so test bits 8-15 (%dh).  An all-zero %dh is passed on to
         * AMD_exit_more_16.  When none of bits 8-14 is set, bit 15 must
         * be, and control falls through into tail_15.
         */
        .p2align 4
LABEL(AMD_exit_more_8):
        test    %dh, %dh
        jz      LABEL(AMD_exit_more_16)
        test    $0x01, %dh
        jnz     LABEL(tail_8)
        test    $0x02, %dh
        jnz     LABEL(tail_9)
        test    $0x04, %dh
        jnz     LABEL(tail_10)
        test    $0x08, %dh
        jnz     LABEL(tail_11)
        test    $0x10, %dh
        jnz     LABEL(tail_12)
        test    $0x20, %dh
        jnz     LABEL(tail_13)
        test    $0x40, %dh
        jnz     LABEL(tail_14)

        /*
         * tail_15: copy the final 16 bytes from (%rsi) to (%rdi) with two
         * 8-byte moves.  strncpy: %cl = bytes written, %r8 = count left
         * after them; a non-zero remainder is zero filled by
         * strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_15):                         /* 16 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     8(%rsi), %rdx
        mov     %rdx, 8(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $16, %cl
        sub     $16, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_9: copy the final 10 bytes from (%rsi) to (%rdi): an 8-byte
         * move for bytes 0-7, then an overlapping 4-byte move for bytes 6-9.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_9):                          /* 10 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     6(%rsi), %edx
        mov     %edx, 6(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $10, %cl
        sub     $10, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_10: copy the final 11 bytes from (%rsi) to (%rdi): an 8-byte
         * move for bytes 0-7, then an overlapping 4-byte move for bytes
         * 7-10.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_10):                         /* 11 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     7(%rsi), %edx
        mov     %edx, 7(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $11, %cl
        sub     $11, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_11: copy the final 12 bytes from (%rsi) to (%rdi): an 8-byte
         * move for bytes 0-7, then a 4-byte move for bytes 8-11.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_11):                         /* 12 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     8(%rsi), %edx
        mov     %edx, 8(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $12, %cl
        sub     $12, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_12: copy the final 13 bytes from (%rsi) to (%rdi) with two
         * overlapping 8-byte moves (bytes 0-7, then bytes 5-12).
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_12):                         /* 13 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     5(%rsi), %rcx
        mov     %rcx, 5(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $13, %cl
        sub     $13, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_13: copy the final 14 bytes from (%rsi) to (%rdi) with two
         * overlapping 8-byte moves (bytes 0-7, then bytes 6-13).
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_13):                         /* 14 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     6(%rsi), %rcx
        mov     %rcx, 6(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $14, %cl
        sub     $14, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_14: copy the final 15 bytes from (%rsi) to (%rdi) with two
         * overlapping 8-byte moves (bytes 0-7, then bytes 7-14).
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_14):                         /* 15 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     7(%rsi), %rcx
        mov     %rcx, 7(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $15, %cl
        sub     $15, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * Third stage of the non-bsf exit: mask bits 0-15 were all clear.
         * Shift the mask right by 16 so that original bits 16-23 can be
         * tested in %dl (and bits 24-31 in %dh by AMD_exit_more_24).  When
         * %dl is non-zero and none of its bits 0-6 is set, original bit 23
         * must be, and control falls through into tail_23.
         */
        .p2align 4
LABEL(AMD_exit_more_16):
        shr     $16, %edx
        test    %dl, %dl
        jz      LABEL(AMD_exit_more_24)
        test    $0x01, %dl
        jnz     LABEL(tail_16)
        test    $0x02, %dl
        jnz     LABEL(tail_17)
        test    $0x04, %dl
        jnz     LABEL(tail_18)
        test    $0x08, %dl
        jnz     LABEL(tail_19)
        test    $0x10, %dl
        jnz     LABEL(tail_20)
        test    $0x20, %dl
        jnz     LABEL(tail_21)
        test    $0x40, %dl
        jnz     LABEL(tail_22)

        /*
         * tail_23: copy the final 24 bytes from (%rsi) to (%rdi) with
         * three 8-byte moves.  strncpy: %cl = bytes written, %r8 = count
         * left after them; a non-zero remainder is zero filled by
         * strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_23):                         /* 24 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     8(%rsi), %rdx
        mov     %rdx, 8(%rdi)
        mov     16(%rsi), %rcx
        mov     %rcx, 16(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $24, %cl
        sub     $24, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_16: copy the final 17 bytes from (%rsi) to (%rdi): two
         * 8-byte moves for bytes 0-15, then a 1-byte move for byte 16.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_16):                         /* 17 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     8(%rsi), %rdx
        mov     %rdx, 8(%rdi)
        mov     16(%rsi), %cl
        mov     %cl, 16(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $17, %cl
        sub     $17, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_17: copy the final 18 bytes from (%rsi) to (%rdi): two
         * 8-byte moves for bytes 0-15, then a 2-byte move for bytes 16-17.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_17):                         /* 18 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     8(%rsi), %rdx
        mov     %rdx, 8(%rdi)
        mov     16(%rsi), %cx
        mov     %cx, 16(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $18, %cl
        sub     $18, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_18: copy the final 19 bytes from (%rsi) to (%rdi): two
         * 8-byte moves for bytes 0-15, then an overlapping 4-byte move for
         * bytes 15-18.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_18):                         /* 19 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     8(%rsi), %rdx
        mov     %rdx, 8(%rdi)
        mov     15(%rsi), %ecx
        mov     %ecx,15(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $19, %cl
        sub     $19, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_19: copy the final 20 bytes from (%rsi) to (%rdi): two
         * 8-byte moves for bytes 0-15, then a 4-byte move for bytes 16-19.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_19):                         /* 20 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     8(%rsi), %rdx
        mov     %rdx, 8(%rdi)
        mov     16(%rsi), %ecx
        mov     %ecx, 16(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $20, %cl
        sub     $20, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_20: copy the final 21 bytes from (%rsi) to (%rdi): two
         * 8-byte moves for bytes 0-15, then an overlapping 8-byte move for
         * bytes 13-20.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_20):                         /* 21 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     8(%rsi), %rdx
        mov     %rdx, 8(%rdi)
        mov     13(%rsi), %rcx
        mov     %rcx, 13(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $21, %cl
        sub     $21, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_21: copy the final 22 bytes from (%rsi) to (%rdi): two
         * 8-byte moves for bytes 0-15, then an overlapping 8-byte move for
         * bytes 14-21.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_21):                         /* 22 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     8(%rsi), %rdx
        mov     %rdx, 8(%rdi)
        mov     14(%rsi), %rcx
        mov     %rcx, 14(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $22, %cl
        sub     $22, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * tail_22: copy the final 23 bytes from (%rsi) to (%rdi): two
         * 8-byte moves for bytes 0-15, then an overlapping 8-byte move for
         * bytes 15-22.
         * strncpy: %cl = bytes written, %r8 = count left after them; a
         * non-zero remainder is zero filled by strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_22):                         /* 23 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     8(%rsi), %rdx
        mov     %rdx, 8(%rdi)
        mov     15(%rsi), %rcx
        mov     %rcx, 15(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $23, %cl
        sub     $23, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * Last stage of the non-bsf exit.  %edx was shifted right by 16 in
         * AMD_exit_more_16 and its %dl was zero, so %dh now holds original
         * mask bits 24-31.  There is no zero test here: when none of bits
         * 0-6 of %dh is set, control falls through into tail_31.
         */
        .p2align 4
LABEL(AMD_exit_more_24):
        test    $0x01, %dh
        jnz     LABEL(tail_24)
        test    $0x02, %dh
        jnz     LABEL(tail_25)
        test    $0x04, %dh
        jnz     LABEL(tail_26)
        test    $0x08, %dh
        jnz     LABEL(tail_27)
        test    $0x10, %dh
        jnz     LABEL(tail_28)
        test    $0x20, %dh
        jnz     LABEL(tail_29)
        test    $0x40, %dh
        jnz     LABEL(tail_30)

        /*
         * tail_31: copy the final 32 bytes from (%rsi) to (%rdi) with four
         * 8-byte moves.  strncpy: %cl = bytes written, %r8 = count left
         * after them; a non-zero remainder is zero filled by
         * strncpy_fill_tail.
         */
        .p2align 4
LABEL(tail_31):                         /* 32 bytes */
        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)
        mov     8(%rsi), %rdx
        mov     %rdx, 8(%rdi)
        mov     16(%rsi), %rcx
        mov     %rcx, 16(%rdi)
        mov     24(%rsi), %rdx
        mov     %rdx, 24(%rdi)
#ifdef USE_AS_STRNCPY
        mov     $32, %cl
        sub     $32, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        .p2align 4
/*
 * tail_24: copy exactly 25 bytes from (%rsi) to (%rdi).
 * Three 8-byte moves cover bytes 0-23; one 4-byte move at offset 21
 * picks up byte 24 while re-copying bytes 21-23.
 * %rax is not touched here.
 */
LABEL(tail_24):                         /* 25 bytes */
        movq    (%rsi), %rcx            /* bytes 0-7 */
        movq    %rcx, (%rdi)
        movq    8(%rsi), %rdx           /* bytes 8-15 */
        movq    %rdx, 8(%rdi)
        movq    16(%rsi), %rcx          /* bytes 16-23 */
        movq    %rcx, 16(%rdi)
        movl    21(%rsi), %edx          /* bytes 21-24, re-copies 21-23 */
        movl    %edx, 21(%rdi)
#ifdef USE_AS_STRNCPY
        /*
         * %cl = bytes copied, %r8 -= bytes copied; a non-zero remainder
         * is passed on to strncpy_fill_tail (outside this block).
         */
        movb    $25, %cl
        subq    $25, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        .p2align 4
/*
 * tail_25: copy exactly 26 bytes from (%rsi) to (%rdi).
 * Three 8-byte moves cover bytes 0-23; one 4-byte move at offset 22
 * picks up bytes 24-25 while re-copying bytes 22-23.
 * %rax is not touched here.
 */
LABEL(tail_25):                         /* 26 bytes */
        movq    (%rsi), %rcx            /* bytes 0-7 */
        movq    %rcx, (%rdi)
        movq    8(%rsi), %rdx           /* bytes 8-15 */
        movq    %rdx, 8(%rdi)
        movq    16(%rsi), %rcx          /* bytes 16-23 */
        movq    %rcx, 16(%rdi)
        movl    22(%rsi), %edx          /* bytes 22-25, re-copies 22-23 */
        movl    %edx, 22(%rdi)
#ifdef USE_AS_STRNCPY
        /*
         * %cl = bytes copied, %r8 -= bytes copied; a non-zero remainder
         * is passed on to strncpy_fill_tail (outside this block).
         */
        movb    $26, %cl
        subq    $26, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        .p2align 4
/*
 * tail_26: copy exactly 27 bytes from (%rsi) to (%rdi).
 * Three 8-byte moves cover bytes 0-23; one 4-byte move at offset 23
 * picks up bytes 24-26 while re-copying byte 23.
 * %rax is not touched here.
 */
LABEL(tail_26):                         /* 27 bytes */
        movq    (%rsi), %rcx            /* bytes 0-7 */
        movq    %rcx, (%rdi)
        movq    8(%rsi), %rdx           /* bytes 8-15 */
        movq    %rdx, 8(%rdi)
        movq    16(%rsi), %rcx          /* bytes 16-23 */
        movq    %rcx, 16(%rdi)
        movl    23(%rsi), %edx          /* bytes 23-26, re-copies byte 23 */
        movl    %edx, 23(%rdi)
#ifdef USE_AS_STRNCPY
        /*
         * %cl = bytes copied, %r8 -= bytes copied; a non-zero remainder
         * is passed on to strncpy_fill_tail (outside this block).
         */
        movb    $27, %cl
        subq    $27, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        .p2align 4
/*
 * tail_27: copy exactly 28 bytes from (%rsi) to (%rdi).
 * Three 8-byte moves cover bytes 0-23 and a 4-byte move finishes bytes
 * 24-27; none of the moves overlap.
 * %rax is not touched here.
 */
LABEL(tail_27):                         /* 28 bytes */
        movq    (%rsi), %rcx            /* bytes 0-7 */
        movq    %rcx, (%rdi)
        movq    8(%rsi), %rdx           /* bytes 8-15 */
        movq    %rdx, 8(%rdi)
        movq    16(%rsi), %rcx          /* bytes 16-23 */
        movq    %rcx, 16(%rdi)
        movl    24(%rsi), %edx          /* bytes 24-27 */
        movl    %edx, 24(%rdi)
#ifdef USE_AS_STRNCPY
        /*
         * %cl = bytes copied, %r8 -= bytes copied; a non-zero remainder
         * is passed on to strncpy_fill_tail (outside this block).
         */
        movb    $28, %cl
        subq    $28, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        .p2align 4
/*
 * tail_28: copy exactly 29 bytes from (%rsi) to (%rdi).
 * Three 8-byte moves cover bytes 0-23; a fourth 8-byte move at offset 21
 * picks up bytes 24-28 while re-copying bytes 21-23.
 * %rax is not touched here.
 */
LABEL(tail_28):                         /* 29 bytes */
        movq    (%rsi), %rcx            /* bytes 0-7 */
        movq    %rcx, (%rdi)
        movq    8(%rsi), %rdx           /* bytes 8-15 */
        movq    %rdx, 8(%rdi)
        movq    16(%rsi), %rcx          /* bytes 16-23 */
        movq    %rcx, 16(%rdi)
        movq    21(%rsi), %rdx          /* bytes 21-28, re-copies 21-23 */
        movq    %rdx, 21(%rdi)
#ifdef USE_AS_STRNCPY
        /*
         * %cl = bytes copied, %r8 -= bytes copied; a non-zero remainder
         * is passed on to strncpy_fill_tail (outside this block).
         */
        movb    $29, %cl
        subq    $29, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        .p2align 4
/*
 * tail_29: copy exactly 30 bytes from (%rsi) to (%rdi).
 * Three 8-byte moves cover bytes 0-23; a fourth 8-byte move at offset 22
 * picks up bytes 24-29 while re-copying bytes 22-23.
 * %rax is not touched here.
 */
LABEL(tail_29):                         /* 30 bytes */
        movq    (%rsi), %rcx            /* bytes 0-7 */
        movq    %rcx, (%rdi)
        movq    8(%rsi), %rdx           /* bytes 8-15 */
        movq    %rdx, 8(%rdi)
        movq    16(%rsi), %rcx          /* bytes 16-23 */
        movq    %rcx, 16(%rdi)
        movq    22(%rsi), %rdx          /* bytes 22-29, re-copies 22-23 */
        movq    %rdx, 22(%rdi)
#ifdef USE_AS_STRNCPY
        /*
         * %cl = bytes copied, %r8 -= bytes copied; a non-zero remainder
         * is passed on to strncpy_fill_tail (outside this block).
         */
        movb    $30, %cl
        subq    $30, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        .p2align 4
/*
 * tail_30: copy exactly 31 bytes from (%rsi) to (%rdi).
 * Three 8-byte moves cover bytes 0-23; a fourth 8-byte move at offset 23
 * picks up bytes 24-30 while re-copying byte 23.
 * %rax is not touched here.
 */
LABEL(tail_30):                         /* 31 bytes */
        movq    (%rsi), %rcx            /* bytes 0-7 */
        movq    %rcx, (%rdi)
        movq    8(%rsi), %rdx           /* bytes 8-15 */
        movq    %rdx, 8(%rdi)
        movq    16(%rsi), %rcx          /* bytes 16-23 */
        movq    %rcx, 16(%rdi)
        movq    23(%rsi), %rdx          /* bytes 23-30, re-copies byte 23 */
        movq    %rdx, 23(%rdi)
#ifdef USE_AS_STRNCPY
        /*
         * %cl = bytes copied, %r8 -= bytes copied; a non-zero remainder
         * is passed on to strncpy_fill_tail (outside this block).
         */
        movb    $31, %cl
        subq    $31, %r8
        jnz     LABEL(strncpy_fill_tail)
#endif
        ret

        /*
         * Jump tables.  They are emitted into .rodata; the surrounding
         * .pushsection/.popsection pair restores the previous section
         * afterwards.  Every entry is a 32-bit offset of a handler label
         * measured from the start of its own table, so the tables need no
         * absolute addresses.
         * NOTE(review): the dispatch code is outside this chunk; presumably
         * it adds the loaded entry to the table address to form the branch
         * target - confirm at the use sites.
         */
        .pushsection .rodata
        .p2align 4
/*
 * tail_table: entry N refers to tail_N, the routine that copies N + 1
 * bytes (entry 0 = 1 byte ... entry 31 = 32 bytes).  Entry order is the
 * index and must not be changed.
 */
LABEL(tail_table):
        .int    LABEL(tail_0) - LABEL(tail_table)       /* 1 byte */
        .int    LABEL(tail_1) - LABEL(tail_table)
        .int    LABEL(tail_2) - LABEL(tail_table)
        .int    LABEL(tail_3) - LABEL(tail_table)
        .int    LABEL(tail_4) - LABEL(tail_table)
        .int    LABEL(tail_5) - LABEL(tail_table)
        .int    LABEL(tail_6) - LABEL(tail_table)
        .int    LABEL(tail_7) - LABEL(tail_table)
        .int    LABEL(tail_8) - LABEL(tail_table)
        .int    LABEL(tail_9) - LABEL(tail_table)
        .int    LABEL(tail_10) - LABEL(tail_table)
        .int    LABEL(tail_11) - LABEL(tail_table)
        .int    LABEL(tail_12) - LABEL(tail_table)
        .int    LABEL(tail_13) - LABEL(tail_table)
        .int    LABEL(tail_14) - LABEL(tail_table)
        .int    LABEL(tail_15) - LABEL(tail_table)
        .int    LABEL(tail_16) - LABEL(tail_table)
        .int    LABEL(tail_17) - LABEL(tail_table)
        .int    LABEL(tail_18) - LABEL(tail_table)
        .int    LABEL(tail_19) - LABEL(tail_table)
        .int    LABEL(tail_20) - LABEL(tail_table)
        .int    LABEL(tail_21) - LABEL(tail_table)
        .int    LABEL(tail_22) - LABEL(tail_table)
        .int    LABEL(tail_23) - LABEL(tail_table)
        .int    LABEL(tail_24) - LABEL(tail_table)
        .int    LABEL(tail_25) - LABEL(tail_table)
        .int    LABEL(tail_26) - LABEL(tail_table)
        .int    LABEL(tail_27) - LABEL(tail_table)
        .int    LABEL(tail_28) - LABEL(tail_table)
        .int    LABEL(tail_29) - LABEL(tail_table)
        .int    LABEL(tail_30) - LABEL(tail_table)
        .int    LABEL(tail_31) - LABEL(tail_table)      /* 32 bytes */

        .p2align 4
/*
 * unaligned_table: entry N refers to ashr_N, for N = 0..15.
 * NOTE(review): the ashr_N routines are outside this chunk; presumably N
 * is the relative misalignment between source and destination within a
 * 16-byte block - confirm against the code that indexes this table.
 */
LABEL(unaligned_table):
        .int    LABEL(ashr_0) - LABEL(unaligned_table)
        .int    LABEL(ashr_1) - LABEL(unaligned_table)
        .int    LABEL(ashr_2) - LABEL(unaligned_table)
        .int    LABEL(ashr_3) - LABEL(unaligned_table)
        .int    LABEL(ashr_4) - LABEL(unaligned_table)
        .int    LABEL(ashr_5) - LABEL(unaligned_table)
        .int    LABEL(ashr_6) - LABEL(unaligned_table)
        .int    LABEL(ashr_7) - LABEL(unaligned_table)
        .int    LABEL(ashr_8) - LABEL(unaligned_table)
        .int    LABEL(ashr_9) - LABEL(unaligned_table)
        .int    LABEL(ashr_10) - LABEL(unaligned_table)
        .int    LABEL(ashr_11) - LABEL(unaligned_table)
        .int    LABEL(ashr_12) - LABEL(unaligned_table)
        .int    LABEL(ashr_13) - LABEL(unaligned_table)
        .int    LABEL(ashr_14) - LABEL(unaligned_table)
        .int    LABEL(ashr_15) - LABEL(unaligned_table)
        .popsection

/*
 * Close the function opened by the matching ENTRY() at the top of the
 * file: strncpy when built with USE_AS_STRNCPY, strcpy otherwise.
 */
#if defined(USE_AS_STRNCPY)
        SET_SIZE(strncpy)
#else   /* !USE_AS_STRNCPY */
        SET_SIZE(strcpy)                        /* (char *, const char *) */
#endif  /* USE_AS_STRNCPY */