root/usr/src/lib/libc/amd64/gen/strcmp.S
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 *      str[n]cmp - compare chars between two string
 */

#include "SYS.h"
#include "proc64_id.h"

#define LABEL(s) .strcmp##s

#ifdef USE_AS_STRNCMP
        /*
         * Since the counter, %r11, is unsigned, we branch to strcmp_exitz
         * if the new counter > the old one or is 0.
         */
#define UPDATE_STRNCMP_COUNTER                          \
        /* calculate left number to compare */          \
        lea     -16(%rcx, %r11), %r9;                   \
        cmp     %r9, %r11;                              \
        jb      LABEL(strcmp_exitz);                    \
        test    %r9, %r9;                               \
        je      LABEL(strcmp_exitz);                    \
        mov     %r9, %r11
#else
#define UPDATE_STRNCMP_COUNTER
#endif

        /*
         * This implementation uses SSE to compare up to 16 bytes at a time.
         */
#ifdef USE_AS_STRNCMP
        ENTRY(strncmp)
        test    %rdx, %rdx
        je      LABEL(strcmp_exitz)
        mov     %rdx, %r11
#else
        ENTRY(strcmp)                   /* (const char *, const char *) */
#endif
        mov     %esi, %ecx
        mov     %edi, %eax
        and     $0x3f, %rcx             /* rsi alignment in cache line */
        and     $0x3f, %rax             /* rdi alignment in cache line */
        cmp     $0x30, %ecx
        ja      LABEL(crosscache)       /* rsi: 16-byte load will cross cache line */
        cmp     $0x30, %eax
        ja      LABEL(crosscache)       /* rdi: 16-byte load will cross cache line */
        movlpd  (%rdi), %xmm1
        movlpd  (%rsi), %xmm2
        movhpd  8(%rdi), %xmm1
        movhpd  8(%rsi), %xmm2
        pxor    %xmm0, %xmm0            /* clear %xmm0 for null char checks */
        pcmpeqb %xmm1, %xmm0            /* Any null chars? */
        pcmpeqb %xmm2, %xmm1            /* compare first 16 bytes for equality */
        psubb   %xmm0, %xmm1            /* packed sub of comparison results*/
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx           /* if first 16 bytes are same, edx == 0xffff */
        jnz     LABEL(less16bytes)      /* If not, found mismatch or null char */
#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)     /* finish comparision */
#endif
        add     $16, %rsi               /* prepare to search next 16 bytes */
        add     $16, %rdi               /* prepare to search next 16 bytes */

        /*
         * Determine rdi and rsi string offsets from 16-byte alignment.
         * Use relative offset difference between the two to determine which case
         * below to use.
         */
        .p2align 4
LABEL(crosscache):
        and     $0xfffffffffffffff0, %rsi       /* force %rsi to be 16 byte aligned */
        and     $0xfffffffffffffff0, %rdi       /* force %rdi to be 16 byte aligned */
        mov     $0xffff, %edx                   /* for equivalent offset */
        xor     %r8d, %r8d
        and     $0xf, %ecx                      /* offset of rsi */
        and     $0xf, %eax                      /* offset of rdi */
        cmp     %eax, %ecx
        je      LABEL(ashr_0)                   /* both strings have the same alignment */
        ja      LABEL(bigger)
        mov     %edx, %r8d                      /* r8d is offset flag for exit tail */
        xchg    %ecx, %eax
        xchg    %rsi, %rdi
LABEL(bigger):
        mov     %rcx, %r9
        sub     %rax, %r9
        lea     LABEL(unaligned_table)(%rip), %r10
        movslq  (%r10, %r9, 4), %r9
        lea     (%r10, %r9), %r10
        jmp     *%r10                           /* jump to corresponding case */

/*
 * ashr_0 handles the following cases:
 *      str1 offset = str2 offset
 */
        .p2align 4
LABEL(ashr_0):
        movdqa  (%rsi), %xmm1
        pxor    %xmm0, %xmm0                    /* clear %xmm0 for null char check */
        pcmpeqb %xmm1, %xmm0                    /* Any null chars? */
        pcmpeqb (%rdi), %xmm1                   /* compare 16 bytes for equality */
        psubb   %xmm0, %xmm1                    /* packed sub of comparison results*/
        pmovmskb %xmm1, %r9d
        shr     %cl, %edx                       /* adjust 0xffff for offset */
        shr     %cl, %r9d                       /* adjust for 16-byte offset */
        sub     %r9d, %edx
        /*
         * edx must be the same with r9d if in left byte (16-rcx) is equal to
         * the start from (16-rax) and no null char was seen.
         */
        jne     LABEL(less32bytes)              /* mismatch or null char */
        UPDATE_STRNCMP_COUNTER
        mov     $16, %rcx
        mov     $16, %r9
        pxor    %xmm0, %xmm0                    /* clear xmm0, may have changed above */

        /*
         * Now both strings are aligned at 16-byte boundary. Loop over strings
         * checking 32-bytes per iteration.
         */
        .p2align 4
LABEL(loop_ashr_0):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)             /* mismatch or null char seen */

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rcx
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rcx
        jmp     LABEL(loop_ashr_0)

/*
 * ashr_1 handles the following cases:
 *      abs(str1 offset - str2 offset) = 15
 */
        .p2align 4
LABEL(ashr_1):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0            /* Any null chars? */
        pslldq  $15, %xmm2              /* shift first string to align with second */
        pcmpeqb %xmm1, %xmm2            /* compare 16 bytes for equality */
        psubb   %xmm0, %xmm2            /* packed sub of comparison results*/
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx               /* adjust 0xffff for offset */
        shr     %cl, %r9d               /* adjust for 16-byte offset */
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)      /* mismatch or null char seen */
        movdqa  (%rdi), %xmm3
        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx               /* index for loads */
        mov     $1, %r9d                /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     1(%rdi), %r10
        and     $0xfff, %r10            /* offset into 4K page */
        sub     $0x1000, %r10           /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_1):
        add     $16, %r10
        jg      LABEL(nibble_ashr_1)    /* cross page boundary */

LABEL(gobble_ashr_1):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4             /* store for next cycle */

        psrldq  $1, %xmm3
        pslldq  $15, %xmm2
        por     %xmm3, %xmm2            /* merge into one 16byte value */

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_1)    /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4            /* store for next cycle */

        psrldq  $1, %xmm3
        pslldq  $15, %xmm2
        por     %xmm3, %xmm2            /* merge into one 16byte value */

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_1)

        /*
         * Nibble avoids loads across page boundary. This is to avoid a potential
         * access into unmapped memory.
         */
        .p2align 4
LABEL(nibble_ashr_1):
        psrldq  $1, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x7fff, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $15, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_1)

/*
 * ashr_2 handles the following cases:
 *      abs(str1 offset - str2 offset) = 14
 */
        .p2align 4
LABEL(ashr_2):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $14, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3
        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $2, %r9d        /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     2(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_2):
        add     $16, %r10
        jg      LABEL(nibble_ashr_2)

LABEL(gobble_ashr_2):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $2, %xmm3
        pslldq  $14, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_2)    /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $2, %xmm3
        pslldq  $14, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_2)

        .p2align 4
LABEL(nibble_ashr_2):
        psrldq  $2, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x3fff, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $14, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_2)

/*
 * ashr_3 handles the following cases:
 *      abs(str1 offset - str2 offset) = 13
 */
        .p2align 4
LABEL(ashr_3):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $13, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $3, %r9d        /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     3(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_3):
        add     $16, %r10
        jg      LABEL(nibble_ashr_3)

LABEL(gobble_ashr_3):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $3, %xmm3
        pslldq  $13, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_3)    /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $3, %xmm3
        pslldq  $13, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_3)

        .p2align 4
LABEL(nibble_ashr_3):
        psrldq  $3, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x1fff, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $13, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_3)

/*
 * ashr_4 handles the following cases:
 *      abs(str1 offset - str2 offset) = 12
 */
        .p2align 4
LABEL(ashr_4):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $12, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $4, %r9d        /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     4(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_4):
        add     $16, %r10
        jg      LABEL(nibble_ashr_4)

LABEL(gobble_ashr_4):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $4, %xmm3
        pslldq  $12, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_4)    /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $4, %xmm3
        pslldq  $12, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_4)

        .p2align 4
LABEL(nibble_ashr_4):
        psrldq  $4, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x0fff, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $12, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_4)

/*
 * ashr_5 handles the following cases:
 *      abs(str1 offset - str2 offset) = 11
 */
        .p2align 4
LABEL(ashr_5):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $11, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $5, %r9d        /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     5(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_5):
        add     $16, %r10
        jg      LABEL(nibble_ashr_5)

LABEL(gobble_ashr_5):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $5, %xmm3
        pslldq  $11, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_5)    /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $5, %xmm3
        pslldq  $11, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_5)

        .p2align 4
LABEL(nibble_ashr_5):
        psrldq  $5, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x07ff, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $11, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_5)

/*
 * ashr_6 handles the following cases:
 *      abs(str1 offset - str2 offset) = 10
 */
        .p2align 4
LABEL(ashr_6):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $10, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $6, %r9d        /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     6(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_6):
        add     $16, %r10
        jg      LABEL(nibble_ashr_6)

LABEL(gobble_ashr_6):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $6, %xmm3
        pslldq  $10, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_6)    /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $6, %xmm3
        pslldq  $10, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_6)

        .p2align 4
LABEL(nibble_ashr_6):
        psrldq  $6, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x03ff, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $10, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_6)

/*
 * ashr_7 handles the following cases:
 *      abs(str1 offset - str2 offset) = 9
 */
        .p2align 4
LABEL(ashr_7):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $9, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $7, %r9d        /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     7(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_7):
        add     $16, %r10
        jg      LABEL(nibble_ashr_7)

LABEL(gobble_ashr_7):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $7, %xmm3
        pslldq  $9, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_7)    /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $7, %xmm3
        pslldq  $9, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_7)

        .p2align 4
LABEL(nibble_ashr_7):
        psrldq  $7, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x01ff, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $9, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_7)

/*
 * ashr_8 handles the following cases:
 *      abs(str1 offset - str2 offset) = 8
 */
        .p2align 4
LABEL(ashr_8):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $8, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $8, %r9d        /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     8(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_8):
        add     $16, %r10
        jg      LABEL(nibble_ashr_8)

LABEL(gobble_ashr_8):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $8, %xmm3
        pslldq  $8, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_8)    /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $8, %xmm3
        pslldq  $8, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_8)

        .p2align 4
LABEL(nibble_ashr_8):
        psrldq  $8, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x00ff, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $8, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_8)

/*
 * ashr_9 handles the following cases:
 *      abs(str1 offset - str2 offset) = 7
 */
        .p2align 4
LABEL(ashr_9):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $7, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $9, %r9d        /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     9(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_9):
        add     $16, %r10
        jg      LABEL(nibble_ashr_9)

LABEL(gobble_ashr_9):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $9, %xmm3
        pslldq  $7, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_9)    /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $9, %xmm3
        pslldq  $7, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3            /* store for next cycle */
        jmp     LABEL(loop_ashr_9)

        .p2align 4
LABEL(nibble_ashr_9):
        psrldq  $9, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x007f, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $7, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_9)

/*
 * ashr_10 handles the following cases:
 *      abs(str1 offset - str2 offset) = 6
 */
        .p2align 4
LABEL(ashr_10):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $6, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $10, %r9d       /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     10(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_10):
        add     $16, %r10
        jg      LABEL(nibble_ashr_10)

LABEL(gobble_ashr_10):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $10, %xmm3
        pslldq  $6, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_10)   /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $10, %xmm3
        pslldq  $6, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_10)

        .p2align 4
LABEL(nibble_ashr_10):
        psrldq  $10, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x003f, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $6, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_10)

/*
 * ashr_11 handles the following cases:
 *      abs(str1 offset - str2 offset) = 5
 */
        .p2align 4
LABEL(ashr_11):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $5, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $11, %r9d       /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     11(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_11):
        add     $16, %r10
        jg      LABEL(nibble_ashr_11)

LABEL(gobble_ashr_11):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $11, %xmm3
        pslldq  $5, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_11)   /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $11, %xmm3
        pslldq  $5, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_11)

        .p2align 4
LABEL(nibble_ashr_11):
        psrldq  $11, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x001f, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $5, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_11)

/*
 * ashr_12 handles the following cases:
 *      abs(str1 offset - str2 offset) = 4
 */
        .p2align 4
LABEL(ashr_12):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $4, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $12, %r9d       /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     12(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_12):
        add     $16, %r10
        jg      LABEL(nibble_ashr_12)

LABEL(gobble_ashr_12):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $12, %xmm3
        pslldq  $4, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_12)   /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $12, %xmm3
        pslldq  $4, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_12)

        .p2align 4
LABEL(nibble_ashr_12):
        psrldq  $12, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x000f, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $4, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_12)

/*
 * ashr_13 handles the following cases:
 *      abs(str1 offset - str2 offset) = 3
 */
        .p2align 4
LABEL(ashr_13):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $3, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $13, %r9d       /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     13(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_13):
        add     $16, %r10
        jg      LABEL(nibble_ashr_13)

LABEL(gobble_ashr_13):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $13, %xmm3
        pslldq  $3, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_13)   /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $13, %xmm3
        pslldq  $3, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_13)

        .p2align 4
LABEL(nibble_ashr_13):
        psrldq  $13, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x0007, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $3, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_13)

/*
 * ashr_14 handles the following cases:
 *      abs(str1 offset - str2 offset) = 2
 */
        .p2align 4
LABEL(ashr_14):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $2, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $14, %r9d       /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     14(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_14):
        add     $16, %r10
        jg      LABEL(nibble_ashr_14)

LABEL(gobble_ashr_14):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $14, %xmm3
        pslldq  $2, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_14)   /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $14, %xmm3
        pslldq  $2, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_14)

        .p2align 4
LABEL(nibble_ashr_14):
        psrldq  $14, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x0003, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $2, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_14)

/*
 * ashr_15 handles the following cases:
 *      abs(str1 offset - str2 offset) = 1
 */
        .p2align 4
LABEL(ashr_15):
        pxor    %xmm0, %xmm0
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, %xmm0
        pslldq  $1, %xmm2
        pcmpeqb %xmm1, %xmm2
        psubb   %xmm0, %xmm2
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)

        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        pxor    %xmm0, %xmm0
        mov     $16, %rcx       /* index for loads */
        mov     $15, %r9d       /* rdi bytes already examined. Used in exit code */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we are crossing a page boundary and
         * need to do a nibble.
         */
        lea     15(%rdi), %r10
        and     $0xfff, %r10    /* offset into 4K page */
        sub     $0x1000, %r10   /* subtract 4K pagesize */
        movdqa  %xmm3, %xmm4

        .p2align 4
LABEL(loop_ashr_15):
        add     $16, %r10
        jg      LABEL(nibble_ashr_15)

LABEL(gobble_ashr_15):
        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $15, %xmm3
        pslldq  $1, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3

        add     $16, %r10
        jg      LABEL(nibble_ashr_15)   /* cross page boundary */

        movdqa  (%rsi, %rcx), %xmm1
        movdqa  (%rdi, %rcx), %xmm2
        movdqa  %xmm2, %xmm4

        psrldq  $15, %xmm3
        pslldq  $1, %xmm2
        por     %xmm3, %xmm2

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx
        jnz     LABEL(exit)

#ifdef USE_AS_STRNCMP
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rcx
        movdqa  %xmm4, %xmm3
        jmp     LABEL(loop_ashr_15)

        .p2align 4
LABEL(nibble_ashr_15):
        psrldq  $15, %xmm4
        movdqa  (%rsi, %rcx), %xmm1
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm4, %xmm1
        psubb   %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        sub     $0x0001, %edx
        jnz     LABEL(exit)
#ifdef USE_AS_STRNCMP
        cmp     $1, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        pxor    %xmm0, %xmm0
        sub     $0x1000, %r10           /* subtract 4K from %r10 */
        jmp     LABEL(gobble_ashr_15)

        .p2align 4
LABEL(exit):
        lea     -16(%r9, %rcx), %rax    /* locate the exact offset for rdi */
LABEL(less32bytes):
        lea     (%rdi, %rax), %rdi      /* locate the exact address for first operand(rdi) */
        lea     (%rsi, %rcx), %rsi      /* locate the exact address for second operand(rsi) */
        test    %r8d, %r8d
        jz      LABEL(ret)
        xchg    %rsi, %rdi              /* recover original order according to flag(%r8d) */

        .p2align 4
LABEL(ret):
LABEL(less16bytes):
        /*
         * Check to see if BSF is fast on this processor. If not, use a different
         * exit tail.
         */
        testl   $USE_BSF,.memops_method(%rip)
        jz      LABEL(AMD_exit)
        bsf     %rdx, %rdx              /* find and store bit index in %rdx */

#ifdef USE_AS_STRNCMP
        sub     %rdx, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        xor     %ecx, %ecx              /* clear %ecx */
        xor     %eax, %eax              /* clear %eax */

        movb    (%rsi, %rdx), %cl
        movb    (%rdi, %rdx), %al

        sub     %ecx, %eax
        ret

#ifdef USE_AS_STRNCMP
LABEL(strcmp_exitz):
        xor     %eax, %eax
        ret
#endif

        /*
         * This exit tail does not use the bsf instruction.
         */
        .p2align 4
LABEL(AMD_exit):
        test    %dl, %dl
        jz      LABEL(next_8_bytes)

        test    $0x01, %dl
        jnz     LABEL(Byte0)

        test    $0x02, %dl
        jnz     LABEL(Byte1)

        test    $0x04, %dl
        jnz     LABEL(Byte2)

        test    $0x08, %dl
        jnz     LABEL(Byte3)

        test    $0x10, %dl
        jnz     LABEL(Byte4)

        test    $0x20, %dl
        jnz     LABEL(Byte5)

        test    $0x40, %dl
        jnz     LABEL(Byte6)

#ifdef USE_AS_STRNCMP
        sub     $7, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        movzx   7(%rsi), %ecx
        movzx   7(%rdi), %eax

        sub     %ecx, %eax
        ret

        .p2align 4
LABEL(Byte0):
        /*
         * never need to handle byte 0 for strncmpy
#ifdef USE_AS_STRNCMP
        sub     $0, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        */
        movzx   (%rsi), %ecx
        movzx   (%rdi), %eax

        sub     %ecx, %eax
        ret

        .p2align 4
LABEL(Byte1):

#ifdef USE_AS_STRNCMP
        sub     $1, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        movzx   1(%rsi), %ecx
        movzx   1(%rdi), %eax

        sub     %ecx, %eax
        ret

        .p2align 4
LABEL(Byte2):

#ifdef USE_AS_STRNCMP
        sub     $2, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        movzx   2(%rsi), %ecx
        movzx   2(%rdi), %eax

        sub     %ecx, %eax
        ret

        .p2align 4
LABEL(Byte3):

#ifdef USE_AS_STRNCMP
        sub     $3, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        movzx   3(%rsi), %ecx
        movzx   3(%rdi), %eax

        sub     %ecx, %eax
        ret

        .p2align 4
LABEL(Byte4):

#ifdef USE_AS_STRNCMP
        sub     $4, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        movzx   4(%rsi), %ecx
        movzx   4(%rdi), %eax

        sub     %ecx, %eax
        ret

        .p2align 4
LABEL(Byte5):

#ifdef USE_AS_STRNCMP
        sub     $5, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        movzx   5(%rsi), %ecx
        movzx   5(%rdi), %eax

        sub     %ecx, %eax
        ret

        .p2align 4
LABEL(Byte6):

#ifdef USE_AS_STRNCMP
        sub     $6, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        movzx   6(%rsi), %ecx
        movzx   6(%rdi), %eax

        sub     %ecx, %eax
        ret

        .p2align 4
LABEL(next_8_bytes):
        add     $8, %rdi
        add     $8, %rsi
#ifdef USE_AS_STRNCMP
        sub     $8, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        test    $0x01, %dh
        jnz     LABEL(Byte0)

        test    $0x02, %dh
        jnz     LABEL(Byte1)

        test    $0x04, %dh
        jnz     LABEL(Byte2)

        test    $0x08, %dh
        jnz     LABEL(Byte3)

        test    $0x10, %dh
        jnz     LABEL(Byte4)

        test    $0x20, %dh
        jnz     LABEL(Byte5)

        test    $0x40, %dh
        jnz     LABEL(Byte6)

#ifdef USE_AS_STRNCMP
        sub     $7, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        movzx   7(%rsi), %ecx
        movzx   7(%rdi), %eax

        sub     %ecx, %eax
        ret

        .pushsection .rodata
        .p2align 4
LABEL(unaligned_table):
        .int    LABEL(ashr_0) - LABEL(unaligned_table)
        .int    LABEL(ashr_15) - LABEL(unaligned_table)
        .int    LABEL(ashr_14) - LABEL(unaligned_table)
        .int    LABEL(ashr_13) - LABEL(unaligned_table)
        .int    LABEL(ashr_12) - LABEL(unaligned_table)
        .int    LABEL(ashr_11) - LABEL(unaligned_table)
        .int    LABEL(ashr_10) - LABEL(unaligned_table)
        .int    LABEL(ashr_9) - LABEL(unaligned_table)
        .int    LABEL(ashr_8) - LABEL(unaligned_table)
        .int    LABEL(ashr_7) - LABEL(unaligned_table)
        .int    LABEL(ashr_6) - LABEL(unaligned_table)
        .int    LABEL(ashr_5) - LABEL(unaligned_table)
        .int    LABEL(ashr_4) - LABEL(unaligned_table)
        .int    LABEL(ashr_3) - LABEL(unaligned_table)
        .int    LABEL(ashr_2) - LABEL(unaligned_table)
        .int    LABEL(ashr_1) - LABEL(unaligned_table)
        .popsection
#ifdef USE_AS_STRNCMP
        SET_SIZE(strncmp)
#else
        SET_SIZE(strcmp)                /* (const char *, const char *) */
#endif