root/usr/src/lib/libc/capabilities/sun4u-us3/common/memcmp.S
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 */

        .file   "memcmp.s"

/*
 * memcmp(s1, s2, len)
 *
 * Compare n bytes:  s1>s2: >0  s1==s2: 0  s1<s2: <0
 *
 * Fast assembler language version of the following C-program for memcmp
 * which represents the `standard' for the C-library.
 *
 *      int
 *      memcmp(const void *s1, const void *s2, size_t n)
 *      {
 *              if (s1 != s2 && n != 0) {
 *                      const char *ps1 = s1;
 *                      const char *ps2 = s2;
 *                      do {
 *                              if (*ps1++ != *ps2++)
 *                                      return(ps1[-1] - ps2[-1]);
 *                      } while (--n != 0);
 *              }
 *              return (0);
 *      }
 */

#include <sys/asm_linkage.h>
#include <sys/machasi.h>

#define BLOCK_SIZE      64

        ANSI_PRAGMA_WEAK(memcmp,function)

        ENTRY(memcmp)
        cmp     %o0, %o1                ! s1 == s2?
        be      %ncc, .cmpeq
        prefetch [%o0], #one_read
        prefetch [%o1], #one_read

        ! for small counts byte compare immediately
        cmp     %o2, 48
        bleu,a  %ncc, .bytcmp
        mov     %o2, %o3                ! o3 <= 48

        ! Count > 48. We will byte compare (8 + num of bytes to dbl align)
        ! bytes. We assume that most miscompares will occur in the 1st 8 bytes

        prefetch [%o0 + (1 * BLOCK_SIZE)], #one_read
        prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read

.chkdbl:
        and     %o0, 7, %o4             ! is s1 aligned on a 8 byte bound
        mov     8, %o3                  ! o2 > 48;  o3 = 8
        sub     %o4, 8, %o4             ! o4 = -(num of bytes to dbl align)
        ba      %ncc, .bytcmp
        sub     %o3, %o4, %o3           ! o3 = 8 + (num of bytes to dbl align)

1:      ldub    [%o1], %o5              ! byte compare loop
        inc     %o1
        inc     %o0
        dec     %o2
        cmp     %o4, %o5
        bne     %ncc, .noteq
.bytcmp:
        deccc   %o3
        bgeu,a  %ncc, 1b
        ldub    [%o0], %o4

        ! Check to see if there are more bytes to compare
        cmp     %o2, 0                  ! is o2 > 0
        bgu     %ncc, .dwcmp            ! we should already be dbl aligned
        nop
.cmpeq:
        retl                             ! strings compare equal
        sub     %g0, %g0, %o0

.noteq:
        retl                            ! strings aren't equal
        sub     %o4, %o5, %o0           ! return(*s1 - *s2)


        ! double word compare - using ldd and faligndata. Compares upto
        ! 8 byte multiple count and does byte compare for the residual.

.dwcmp:
        prefetch [%o0 + (2 * BLOCK_SIZE)], #one_read
        prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

        ! if fprs.fef == 0, set it. Checking it, reqires 2 instructions.
        ! So set it anyway, without checking.
        rd      %fprs, %o3              ! o3 = fprs
        wr      %g0, 0x4, %fprs         ! fprs.fef = 1

        andn    %o2, 7, %o4             ! o4 has 8 byte aligned cnt
        sub     %o4, 8, %o4
        alignaddr %o1, %g0, %g1
        ldd     [%g1], %d0
4:
        add     %g1, 8, %g1
        ldd     [%g1], %d2
        ldd     [%o0], %d6
        prefetch [%g1 + (3 * BLOCK_SIZE)], #one_read
        prefetch [%o0 + (3 * BLOCK_SIZE)], #one_read
        faligndata %d0, %d2, %d8
        fcmpne32 %d6, %d8, %o5
        fsrc1   %d6, %d6                ! 2 fsrc1's added since o5 cannot
        fsrc1   %d8, %d8                ! be used for 3 cycles else we
        fmovd   %d2, %d0                ! create 9 bubbles in the pipeline
        brnz,a,pn %o5, 6f
        sub     %o1, %o0, %o1           ! o1 gets the difference
        subcc   %o4, 8, %o4
        add     %o0, 8, %o0
        add     %o1, 8, %o1
        bgu,pt  %ncc, 4b
        sub     %o2, 8, %o2

.residcmp:
        ba      6f
        sub     %o1, %o0, %o1           ! o1 gets the difference

5:      ldub    [%o0 + %o1], %o5        ! byte compare loop
        inc     %o0
        cmp     %o4, %o5
        bne     %ncc, .dnoteq
6:
        deccc   %o2
        bgeu,a  %ncc, 5b
        ldub    [%o0], %o4

        and     %o3, 0x4, %o3           ! fprs.du = fprs.dl = 0
        wr      %o3, %g0, %fprs         ! fprs = o3 - restore fprs
        retl
        sub     %g0, %g0, %o0           ! strings compare equal

.dnoteq:
        and     %o3, 0x4, %o3           ! fprs.du = fprs.dl = 0
        wr      %o3, %g0, %fprs         ! fprs = o3 - restore fprs
        retl
        sub     %o4, %o5, %o0           ! return(*s1 - *s2)

        SET_SIZE(memcmp)