root/usr/src/lib/libc/amd64/gen/strlen.S
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 *      strlen - calculate the length of a string
 */

#include "SYS.h"
#include "proc64_id.h"

/*
 * LABEL(s) pastes s onto the prefix ".strlen", so a label written as
 * LABEL(exit) is emitted as .strlenexit. All branch targets inside the
 * routine below are spelled through this macro.
 */
#define LABEL(s) .strlen##s

        /*
         * strlen(const char *s)
         *
         * This implementation uses SSE instructions to compare up to 16 bytes
         * at a time looking for the end of string (null char).
         *
         * In:   %rdi = s
         * Out:  %rax = number of bytes that precede the first null byte of s
         * Overwrites: %rcx, %rdx, %rsi, %rdi, %xmm0 and the flags
         *
         * Register roles:
         *   %rdi   s; negated at the exit label so that a single lea can
         *          form (%rsi - 16 - s)
         *   %rsi   scan pointer; whenever the exit label is reached it is
         *          16 bytes past the address that bit 0 of %edx refers to
         *   %rcx   s & 15 (misalignment of s); reused for the bsf result
         *   %edx   pmovmskb result: bit i is set when byte i compared equal
         *          to the null byte
         *   %xmm0  all-zero comparand for pcmpeqb
         *
         * Every pcmpeqb memory operand is at a 16-byte-aligned address: the
         * unaligned case rounds the first address down, and the loop only
         * ever advances %rsi by 16.
         */
        ENTRY(strlen)                   /* (const char *s) */
        mov     %rdi, %rsi              /* keep original %rdi value */
        mov     %rsi, %rcx
        pxor    %xmm0, %xmm0            /* 16 null chars */
        and     $15, %rcx               /* %rcx = s & 15; ZF set if already aligned */
        jz      LABEL(align16_loop)     /* string is 16 byte aligned */

        /*
         * Unaligned case. Round down to 16-byte boundary before comparing
         * 16 bytes for a null char. The code then compensates for any extra chars
         * preceding the start of the string.
         */
LABEL(unalign16):
        and     $0xfffffffffffffff0, %rsi       /* aligned chunk that contains s[0] */

        pcmpeqb (%rsi), %xmm0           /* 0xff in every lane holding a null byte */
        lea     16(%rdi), %rsi          /* %rsi = s + 16, so exit's base offset is 0 */
        pmovmskb %xmm0, %edx            /* one mask bit per byte lane */

        /*
         * %cl holds the number of chunk bytes that lie before s[0]. Shifting
         * them out discards any matches in front of the string and makes
         * bit 0 of %edx correspond to s[0].
         */
        shr     %cl, %edx               /* Compensate for bytes preceding the string */
        test    %edx, %edx
        jnz     LABEL(exit)
        /* %rsi = (s + 16) - (s & 15), i.e. the chunk after the one just tested */
        sub     %rcx, %rsi              /* no null, adjust to next 16-byte boundary */
        pxor    %xmm0, %xmm0            /* clear xmm0, may have been changed... */

        /*
         * Main loop, unrolled four times. Each step tests one aligned
         * 16-byte chunk and then advances %rsi past it. %xmm0 does not need
         * to be cleared between steps: when no byte matches, pcmpeqb leaves
         * every lane zero, and a match leaves the loop.
         */
        .p2align 4
LABEL(align16_loop):                    /* 16 byte aligned */
        pcmpeqb (%rsi), %xmm0           /* look for null bytes */
        pmovmskb %xmm0, %edx            /* move each byte mask of %xmm0 to edx */

        add     $16, %rsi               /* prepare to search next 16 bytes */
        test    %edx, %edx              /* if no null byte, %edx must be 0 */
        jnz     LABEL(exit)             /* found a null */

        /* second chunk of the unrolled loop */
        pcmpeqb (%rsi), %xmm0
        pmovmskb %xmm0, %edx
        add     $16, %rsi
        test    %edx, %edx
        jnz     LABEL(exit)

        /* third chunk of the unrolled loop */
        pcmpeqb (%rsi), %xmm0
        pmovmskb %xmm0, %edx
        add     $16, %rsi
        test    %edx, %edx
        jnz     LABEL(exit)

        /* fourth chunk; loop again if clean, otherwise fall through to exit */
        pcmpeqb (%rsi), %xmm0
        pmovmskb %xmm0, %edx
        add     $16, %rsi
        test    %edx, %edx
        jz      LABEL(align16_loop)

        /*
         * A null byte was found. Here %edx is non-zero and its lowest set
         * bit is the index of the first null relative to (%rsi - 16), so
         * the length is (%rsi - 16 - s) + index.
         */
        .p2align 4
LABEL(exit):
        neg     %rdi                    /* %rdi = -s, so the lea below subtracts s */
        /*
         * Check to see if BSF is fast on this processor. If not, use a different
         * exit tail to find first bit set indicating null byte match.
         *
         * USE_BSF and .memops_method are not defined in this file; they are
         * presumably supplied by proc64_id.h and the processor
         * identification code -- confirm there.
         */
        testl   $USE_BSF, .memops_method(%rip)
        jz      LABEL(AMD_exit)

        lea     -16(%rdi, %rsi), %rax   /* calculate exact offset */
        bsf     %edx, %ecx              /* Least significant 1 bit is index of null */
        lea     (%rax, %rcx),%rax       /* length = chunk offset + index of null */
        ret

        /*
         * This exit tail does not use the bsf instruction.
         *
         * It finds the lowest set bit of the 16-bit mask by testing one bit
         * at a time: %dl covers bytes 0-7 of the chunk, %dh covers bytes 8-15.
         * Each exit_tailN label adds N to %rax and returns.
         */
        .p2align 4
LABEL(AMD_exit):
        lea     -16(%rdi, %rsi), %rax   /* %rax = offset of bit 0's byte from s */
        test    %dl, %dl
        jz      LABEL(exit_high)        /* no null in bytes 0-7, so it is in 8-15 */
        test    $0x01, %dl
        jnz     LABEL(exit_tail0)

        test    $0x02, %dl
        jnz     LABEL(exit_tail1)

        /*
         * NOTE(review): this alignment directive sits in the middle of a
         * fall-through sequence; the reason for padding here is not evident
         * from this file.
         */
        .p2align 4
        test    $0x04, %dl
        jnz     LABEL(exit_tail2)

        test    $0x08, %dl
        jnz     LABEL(exit_tail3)

        test    $0x10, %dl
        jnz     LABEL(exit_tail4)

        test    $0x20, %dl
        jnz     LABEL(exit_tail5)

        test    $0x40, %dl
        jnz     LABEL(exit_tail6)
        /* %dl is non-zero and bits 0-6 are clear, so bit 7 must be set */
        add     $7, %rax
        ret

        /*
         * The low mask byte was zero, so the first null is in bytes 8-15.
         * Bias %rax by 8 and reuse the same tails for the bits of %dh.
         */
        .p2align 4
LABEL(exit_high):
        add     $8, %rax
        test    $0x01, %dh
        jnz     LABEL(exit_tail0)

        test    $0x02, %dh
        jnz     LABEL(exit_tail1)

        test    $0x04, %dh
        jnz     LABEL(exit_tail2)

        test    $0x08, %dh
        jnz     LABEL(exit_tail3)

        test    $0x10, %dh
        jnz     LABEL(exit_tail4)

        test    $0x20, %dh
        jnz     LABEL(exit_tail5)

        test    $0x40, %dh
        jnz     LABEL(exit_tail6)
        /* %edx is non-zero, %dl is zero and bits 0-6 of %dh are clear: bit 7 */
        add     $7, %rax
        ret

        /*
         * Null is at index 0 of the (half-)chunk; %rax is already the length.
         *
         * NOTE(review): the xor below does not affect the returned value
         * (%ecx is not read afterwards). Presumably it is there so that the
         * ret is not itself the direct target of the jnz branches above,
         * which AMD's optimization guides advise against for a one-byte
         * ret -- confirm before removing it.
         */
        .p2align 4
LABEL(exit_tail0):
        xor     %ecx, %ecx
        ret

        .p2align 4
LABEL(exit_tail1):
        add     $1, %rax                /* null at index 1 */
        ret

        .p2align 4
LABEL(exit_tail2):
        add     $2, %rax                /* null at index 2 */
        ret

        .p2align 4
LABEL(exit_tail3):
        add     $3, %rax                /* null at index 3 */
        ret

        .p2align 4
LABEL(exit_tail4):
        add     $4, %rax                /* null at index 4 */
        ret

        .p2align 4
LABEL(exit_tail5):
        add     $5, %rax                /* null at index 5 */
        ret

        .p2align 4
LABEL(exit_tail6):
        add     $6, %rax                /* null at index 6 */
        ret
        SET_SIZE(strlen)