root/usr/src/lib/libmvec/common/vis/__vatan.S
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

        .file   "__vatan.S"

#include "libm.h"

        RO_DATA

! following is the C version of the ATAN algorithm
!  #include <math.h>
!  #include <stdio.h>
!  double jkatan(double *x)
!  {
!    double  f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy;
!    int index, sign, intf, intz;
!    extern const double __vlibm_TBL_atan1[];
!    long *pf = (long *) &f, *pz = (long *) &z;
!
!  /*    Power series  atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7
!   *    Error =  -3.08254E-18   On the interval  |x| < 1/64 */
!
!  /* define dummy names for readability.  Use parray to help compiler optimize loads */
!  #define p3    parray[0]
!  #define p2    parray[1]
!  #define p1    parray[2]
!  #define soffset      3
!
!    static const double parray[] = {
!     -1.428029046844299722E-01,                /* p[3]         */
!      1.999999917247000615E-01,                /* p[2]         */
!     -3.333333333329292858E-01,                /* p[1]         */
!      1.0,                             /* not used for p[0], though            */
!     -1.0,                             /* used to flip sign of answer          */
!    };
!
!    f        = *x;                             /* fetch argument               */
!    intf     = pf[0];                          /* grab upper half              */
!    sign     = intf & 0x80000000;                      /* sign of argument             */
!    intf    ^= sign;                           /* abs(upper argument)          */
!    sign     = (unsigned) sign >> 31;          /* sign bit = 0 or 1            */
!    pf[0]    = intf;
!
!    if( (intf > 0x43600000) || (intf < 0x3e300000) ) /* filter out special cases */
!    {
!      if(  (intf > 0x7ff00000) ||
!        ((intf == 0x7ff00000) &&  (pf[1] !=0)) ) return (*x-*x);/* return NaN if x=NaN*/
!      if( intf < 0x3e300000 )                  /* avoid underflow for small arg */
!      {
!        dummy = 1.0e37 + f;
!        dummy = dummy;
!        return (*x);
!      }
!      if( intf > 0x43600000 )                  /* avoid underflow for big arg  */
!      {
!        index = 2;
!        f     = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low   */
!        f     = parray[soffset + sign] * f;    /* put sign bit on ans          */
!        return (f);
!      }
!    }
!
!    index    = 0;                              /* points to 0,0 in table       */
!    if (intf > 0x40500000)                     /* if(|x| > 64                  */
!    { f = -1.0/f;
!      index  = 2;                              /* point to pi/2 upper, lower   */
!    }
!    else if( intf >= 0x3f900000 )              /* if |x| >= (1/64)...          */
!    {
!      intz   = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper       */
!      pz[0]  = intz;                           /* store as a double (z)        */
!      pz[1]  = 0;                              /* ...lower                     */
!      f      = (f - z)/(1.0 + f*z);            /* get reduced argument         */
!      index  = (intz - 0x3f900000) >> 15;      /* (index >> 16) << 1)          */
!      index += 4;                              /* skip over 0,0,pi/2,pi/2      */
!    }
!    conup    = __vlibm_TBL_atan1[index];       /* upper table                  */
!    conlo    = __vlibm_TBL_atan1[index+1];     /* lower table                  */
!    tmp      = f*f;
!    poly     = (f*tmp)*((p3*tmp + p2)*tmp + p1);
!    ansu     = conup + f;                      /* compute atan(f)  upper       */
!    ansl     = (((conup - ansu) + f) + poly) + conlo;
!    ans      = ansu + ansl;
!    ans      = parray[soffset + sign] * ans;
!    return ans;
!  }

/* 8 bytes = 1 double f.p. word */
#define WSIZE   8

        .align  32                      !align with full D-cache line
.COEFFS:
        .double 0r-1.428029046844299722E-01     !p[3]
        .double 0r1.999999917247000615E-01      !p[2]
        .double 0r-3.333333333329292858E-01     !p[1]
        .double 0r-1.0,                         !constant -1.0
        .word   0x00008000,0x0                  !for fp rounding of reduced arg
        .word   0x7fff0000,0x0                  !for fp truncation
        .word   0x47900000,0                    !a number close to 1.0E37
        .word   0x80000000,0x0                  !mask for fp sign bit
        .word   0x3f800000,0x0                  !1.0/128.0 dummy "safe" argument
        .type   .COEFFS,#object

        ENTRY(__vatan)
        save    %sp,-SA(MINFRAME)-16,%sp
        PIC_SETUP(g5)
        PIC_SET(g5,__vlibm_TBL_atan1,o4)
        PIC_SET(g5,.COEFFS,o0)
/*
   __vatan(int n, double *x, int stridex, double *y, stridey)
   computes y(i) = atan( x(i) ), for 1=1,n.  Stridex, stridey
   are the distance between x and y elements

        %i0    n
        %i1    address of x
        %i2    stride x
        %i3    address of y
        %i4    stride y
*/
        cmp     %i0,0                   !if n <=0,
        ble,pn  %icc,.RETURN            !....then do nothing
        sll     %i2,3,%i2               !convert stride to byte count
        sll     %i4,3,%i4               !convert stride to byte count

/*  pre-load constants before beginning main loop */

        ldd     [%o0],%f58              !load p[3]
        mov     2,%i5                   !argcount = 3

        ldd     [%o0+WSIZE],%f60        !load p[2]
        add     %fp,STACK_BIAS-8,%l1            !yaddr1 = &dummy
        fzero   %f18                    !ansu1 = 0

        ldd     [%o0+2*WSIZE],%f62      !load p[1]
        add     %fp,STACK_BIAS-8,%l2            !yaddr2 = &dummy
        fzero   %f12                    !(poly1) = 0

        ldd     [%o0+3*WSIZE],%f56      !-1.0
        fzero   %f14                    !tmp1 = 0

        ldd     [%o0+4*WSIZE],%f52      !load rounding mask
        fzero   %f16                    !conup1 = 0

        ldd     [%o0+5*WSIZE],%f54      !load truncation mask
        fzero   %f36                    !f1 = 0

        ldd     [%o0+6*WSIZE],%f50      !1.0e37
        fzero   %f38                    !f2 = 0

        ldd     [%o0+7*WSIZE],%f32      !mask for sign bit

        ldd     [%o4+2*WSIZE],%f46      !pi/2 upper
        ldd     [%o4+(2*WSIZE+8)],%f48  !pi/2 lower
        sethi   %hi(0x40500000),%l6     !64.0
        sethi   %hi(0x3f900000),%l7     !1/64.0
        mov     0,%l4                   !index1 = 0
        mov     0,%l5                   !index2 = 0

.MAINLOOP:

    /*--------------------------------------------------------------------------*/
    /*--------------------------------------------------------------------------*/
    /*--------------------------------------------------------------------------*/

.LOOP0:
        deccc   %i0                     !--n
        bneg    1f
        mov     %i1,%o5                 !xuse = x (delay slot)

        ba      2f
        nop                             !delay slot
1:
        PIC_SET(g5,.COEFFS+8*WSIZE,o5)
        dec     %i5                     !argcount--
2:
        sethi   %hi(0x80000000),%o7     !mask for sign bit
/*2 */  sethi   %hi(0x43600000),%o1     !big = 0x43600000,0
        ld      [%o5],%o0               !intf = pf[0] = f upper
        ldd     [%o4+%l5],%f26          !conup2 = __vlibm_TBL_atan1[index2]

        sethi   %hi(0x3e300000),%o2     !small = 0x3e300000,0
/*4 */  andn    %o0,%o7,%o0             !intf = fabs(intf)
        ldd     [%o5],%f34              !f = *x into f34

        sub     %o1,%o0,%o1             !(-) if intf > big
/*6 */  sub     %o0,%o2,%o2             !(-) if intf < small
        fand    %f34,%f32,%f40          !sign0 = sign bit
            fmuld   %f38,%f38,%f24      !tmp2= f2*f2

/*7 */  orcc    %o1,%o2,%g0             !(-) if either true
        bneg,pn  %icc,.SPECIAL0         !if (-) goto special cases below
        fabsd   %f34,%f34               !abs(f)  (delay slot)
        !----------------------


        sethi   %hi(0x8000),%o7         !rounding bit
/*8 */  fpadd32 %f34,%f52,%f0           !intf + 0x00008000 (again)
            faddd   %f26,%f38,%f28      !ansu2 = conup2 + f2

        add     %o0,%o7,%o0             !intf + 0x00008000  (delay slot)
/*9*/   fand    %f0,%f54,%f0            !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
            fmuld   %f58,%f24,%f22      !p[3]*tmp2

/*10 */ sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
        fmuld   %f34,%f0,%f10           !f*z
        fsubd   %f34,%f0,%f20           !f - z
          add     %o4,%l4,%l4           !base addr + index1
          fmuld   %f14,%f12,%f12        !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1)
          faddd   %f16,%f36,%f16        !(conup1 - ansu1) + f1

/*12 */ and     %o0,%o7,%o0             !intz = (intf + 0x00008000) & 0x7fff0000
            faddd   %f22,%f60,%f22      !p[3]*tmp2 + p[2]
          ldd     [%l4+WSIZE],%f14      !conlo1 = __vlibm_TBL_atan1[index+1]

/*13 */ sub     %o0,%l7,%o2             !intz - 0x3f900000
        fsubd   %f10,%f56,%f10          !(f*z - (-1.0))
          faddd   %f16,%f12,%f12        !((conup1 - ansu1) + f1) + poly1

        cmp     %o0,%l6                 !(|f| > 64)
        ble     .ELSE0                  !if(|f| > 64) then
/*15 */ sra       %o2,15,%o3            !index  = (intz - 0x3f900000) >> 15
          mov     2,%o1                 !index == 2, point to conup, conlo = pi/2 upper, lower
          ba      .ENDIF0               !continue
/*16 */   fdivd   %f56,%f34,%f34        !f = -1.0/f (delay slot)
        .ELSE0:                         !else f( |x| >= (1/64))
        cmp     %o0,%l7                 !if intf >= 1/64
        bl      .ENDIF0                 !if( |x| >= (1/64) ) then...
        mov     0,%o1                   !index == 0 , point to conup,conlo = 0,0
          add     %o3,4,%o1             !index = index + 4
/*16 */   fdivd   %f20,%f10,%f34        !f = (f - z)/(1.0 + f*z), reduced argument
        .ENDIF0:

/*17*/  sll     %o1,3,%l3               !index0 = index
        mov     %i3,%l0                 !yaddr0 = address of y
          faddd   %f12,%f14,%f12        !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1
            fmuld   %f22,%f24,%f22      !(p3*tmp2 + p2)*tmp2
            fsubd   %f26,%f28,%f26      !conup2 - ansu2

/*20*/  add     %i1,%i2,%i1             !x     += stridex
        add     %i3,%i4,%i3             !y     += stridey
          faddd   %f18,%f12,%f36        !ans1 = ansu1 + ansl1
            fmuld   %f38,%f24,%f24      !f*tmp2
            faddd   %f22,%f62,%f22      !(p3*tmp2 + p2)*tmp2 + p1

/*23*/    for     %f36,%f42,%f36        !sign(ans1) = sign of argument
          std     %f36,[%l1]            !*yaddr1 = ans1
            add     %o4,%l5,%l5         !base addr + index2
            fmuld   %f24,%f22,%f22      !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1)
            faddd   %f26,%f38,%f26      !(conup2 - ansu2) + f2
        cmp     %i5,0                   !if argcount =0, we are done
        be      .RETURN
          nop

    /*--------------------------------------------------------------------------*/
    /*--------------------------------------------------------------------------*/
    /*--------------------------------------------------------------------------*/

.LOOP1:
/*25*/  deccc   %i0                     !--n
        bneg    1f
        mov     %i1,%o5                 !xuse = x (delay slot)
        ba      2f
        nop                             !delay slot
1:
        PIC_SET(g5,.COEFFS+8*WSIZE,o5)
        dec     %i5                     !argcount--
2:

/*26*/  sethi   %hi(0x80000000),%o7     !mask for sign bit
        sethi   %hi(0x43600000),%o1     !big = 0x43600000,0
        ld      [%o5],%o0               !intf = pf[0] = f upper

/*28*/  sethi   %hi(0x3e300000),%o2     !small = 0x3e300000,0
        andn    %o0,%o7,%o0             !intf = fabs(intf)
        ldd     [%o5],%f36              !f = *x into f36

/*30*/  sub     %o1,%o0,%o1             !(-) if intf > big
        sub     %o0,%o2,%o2             !(-) if intf < small
        fand    %f36,%f32,%f42          !sign1 = sign bit

/*31*/  orcc    %o1,%o2,%g0             !(-) if either true
        bneg,pn  %icc,.SPECIAL1         !if (-) goto special cases below
        fabsd   %f36,%f36               !abs(f)  (delay slot)
        !----------------------

/*32*/  fpadd32 %f36,%f52,%f0           !intf + 0x00008000 (again)
          ldd     [%l5+WSIZE],%f24      !conlo2 = __vlibm_TBL_atan1[index2+1]

/*33*/  fand    %f0,%f54,%f0            !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
        sethi   %hi(0x8000),%o7         !rounding bit
          faddd   %f26,%f22,%f22        !((conup2 - ansu2) + f2) + poly2

/*34*/  add     %o0,%o7,%o0             !intf + 0x00008000  (delay slot)
        sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
        fmuld   %f36,%f0,%f10           !f*z
        fsubd   %f36,%f0,%f20           !f - z

/*35*/  and     %o0,%o7,%o0             !intz = (intf + 0x00008000) & 0x7fff0000
          faddd   %f22,%f24,%f22        !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2

/*37*/  sub     %o0,%l7,%o2             !intz - 0x3f900000
        fsubd   %f10,%f56,%f10          !(f*z - (-1.0))
      ldd     [%o4+%l3],%f6             !conup0 = __vlibm_TBL_atan1[index0]

        cmp     %o0,%l6                 !(|f| > 64)
        ble     .ELSE1                  !if(|f| > 64) then
/*38*/  sra     %o2,15,%o3              !index  = (intz - 0x3f900000) >> 15
          mov     2,%o1                 !index == 2, point to conup, conlo = pi/2 upper, lower
          ba      .ENDIF1               !continue
/*40*/    fdivd   %f56,%f36,%f36        !f = -1.0/f (delay slot)
        .ELSE1:                         !else f( |x| >= (1/64))
        cmp     %o0,%l7                 !if intf >= 1/64
        bl      .ENDIF1                 !if( |x| >= (1/64) ) then...
        mov     0,%o1                   !index == 0 , point to conup,conlo = 0,0
          add     %o3,4,%o1             !index = index + 4
/*40*/    fdivd   %f20,%f10,%f36        !f = (f - z)/(1.0 + f*z), reduced argument
        .ENDIF1:

/*41*/sll     %o1,3,%l4                 !index1 = index
      mov     %i3,%l1                   !yaddr1 = address of y
      fmuld   %f34,%f34,%f4             !tmp0= f0*f0
          faddd   %f28,%f22,%f38        !ans2 = ansu2 + ansl2

/*44*/add       %i1,%i2,%i1             !x     += stridex
      add       %i3,%i4,%i3             !y     += stridey
    fmuld   %f58,%f4,%f2                !p[3]*tmp0
    faddd   %f6,%f34,%f8                !ansu0 = conup0 + f0
          for     %f38,%f44,%f38        !sign(ans2) = sign of argument
          std     %f38,[%l2]            !*yaddr2 = ans2
          cmp   %i5,0                   !if argcount =0, we are done
          be    .RETURN
          nop

    /*--------------------------------------------------------------------------*/
    /*--------------------------------------------------------------------------*/
    /*--------------------------------------------------------------------------*/

.LOOP2:
/*46*/  deccc   %i0                     !--n
        bneg    1f
        mov     %i1,%o5                 !xuse = x (delay slot)
        ba      2f
        nop                             !delay slot
1:
        PIC_SET(g5,.COEFFS+8*WSIZE,o5)
        dec     %i5                     !argcount--
2:

/*47*/  sethi   %hi(0x80000000),%o7     !mask for sign bit
        sethi   %hi(0x43600000),%o1     !big = 0x43600000,0
        ld      [%o5],%o0               !intf = pf[0] = f upper

/*49*/  sethi   %hi(0x3e300000),%o2     !small = 0x3e300000,0
        andn    %o0,%o7,%o0             !intf = fabs(intf)
        ldd     [%o5],%f38              !f = *x into f38

/*51*/  sub     %o1,%o0,%o1             !(-) if intf > big
        sub     %o0,%o2,%o2             !(-) if intf < small
        fand    %f38,%f32,%f44          !sign2 = sign bit

/*52*/  orcc    %o1,%o2,%g0             !(-) if either true
        bneg,pn  %icc,.SPECIAL2         !if (-) goto special cases below
        fabsd   %f38,%f38               !abs(f)  (delay slot)
        !----------------------

/*53*/  fpadd32 %f38,%f52,%f0           !intf + 0x00008000 (again)
      faddd   %f2,%f60,%f2              !p[3]*tmp0 + p[2]

/*54*/  sethi   %hi(0x8000),%o7         !rounding bit
        fand    %f0,%f54,%f0            !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)

/*55*/  add     %o0,%o7,%o0             !intf + 0x00008000  (delay slot)
        sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
        fmuld   %f38,%f0,%f10           !f*z
        fsubd   %f38,%f0,%f20           !f - z

/*56*/  and     %o0,%o7,%o0             !intz = (intf + 0x00008000) & 0x7fff0000
      fmuld   %f2,%f4,%f2               !(p3*tmp0 + p2)*tmp0
      fsubd   %f6,%f8,%f6               !conup0 - ansu0

/*58*/  sub     %o0,%l7,%o2             !intz - 0x3f900000
        fsubd   %f10,%f56,%f10          !(f*z - (-1.0))
            ldd     [%o4+%l4],%f16      !conup1 = __vlibm_TBL_atan1[index1]

        cmp     %o0,%l6                 !(|f| > 64)
        ble     .ELSE2                  !if(|f| > 64) then
/*60*/  sra     %o2,15,%o3              !index  = (intz - 0x3f900000) >> 15
          mov     2,%o1                 !index == 2, point to conup, conlo = pi/2 upper, lower
          ba      .ENDIF2               !continue
/*61*/    fdivd   %f56,%f38,%f38        !f = -1.0/f (delay slot)
        .ELSE2:                         !else f( |x| >= (1/64))
        cmp     %o0,%l7                 !if intf >= 1/64
        bl      .ENDIF2                 !if( |x| >= (1/64) ) then...
        mov     0,%o1                   !index == 0 , point to conup,conlo = 0,0
          add     %o3,4,%o1             !index = index + 4
/*61*/    fdivd   %f20,%f10,%f38        !f = (f - z)/(1.0 + f*z), reduced argument
        .ENDIF2:


/*62*/  sll     %o1,3,%l5               !index2 = index
        mov     %i3,%l2                 !yaddr2 = address of y
      fmuld   %f34,%f4,%f4              !f0*tmp0
      faddd   %f2,%f62,%f2              !(p3*tmp0 + p2)*tmp0 + p1
            fmuld   %f36,%f36,%f14      !tmp1= f1*f1

/*65*/add     %o4,%l3,%l3               !base addr + index0
      fmuld   %f4,%f2,%f2               !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1)
      faddd   %f6,%f34,%f6              !(conup0 - ansu0) + f0
            fmuld   %f58,%f14,%f12      !p[3]*tmp1
            faddd   %f16,%f36,%f18      !ansu1 = conup1 + f1
      ldd     [%l3+WSIZE],%f4           !conlo0 = __vlibm_TBL_atan1[index0+1]

/*68*/  add     %i1,%i2,%i1             !x     += stridex
        add     %i3,%i4,%i3             !y     += stridey
      faddd   %f6,%f2,%f2               !((conup0 - ansu0) + f0) + poly0
            faddd   %f12,%f60,%f12      !p[3]*tmp1 + p[2]

/*71*/faddd   %f2,%f4,%f2               !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0
            fmuld   %f12,%f14,%f12      !(p3*tmp1 + p2)*tmp1
            fsubd   %f16,%f18,%f16      !conup1 - ansu1

/*74*/faddd   %f8,%f2,%f34              !ans0 = ansu0 + ansl0
          fmuld   %f36,%f14,%f14        !f1*tmp1
          faddd   %f12,%f62,%f12        !(p3*tmp1 + p2)*tmp1 + p1

/*77*/  for     %f34,%f40,%f34          !sign(ans0) = sign of argument
        std     %f34,[%l0]              !*yaddr0 = ans, always gets stored (delay slot)
        cmp     %i5,0                   !if argcount =0, we are done
        bg      .MAINLOOP
          nop

    /*--------------------------------------------------------------------------*/
    /*--------------------------------------------------------------------------*/
    /*--------------------------------------------------------------------------*/

.RETURN:
        ret
        restore %g0,%g0,%g0

    /*--------------------------------------------------------------------------*/
    /*------------SPECIAL CASE HANDLING FOR LOOP0 ------------------------------*/
    /*--------------------------------------------------------------------------*/

/* at this point
   %i1     x address
   %o0     intf
   %o2     intf - 0x3e300000
   %f34,36,38     f0,f1,f2
   %f40,42,44     sign0,sign1,sign2
*/

        .align  32                              !align on I-cache boundary
.SPECIAL0:
        orcc    %o2,%g0,%g0                     !(-) if intf < 0x3e300000
        bpos    1f                              !if >=...continue
        sethi   %hi(0x7ff00000),%g1     !upper word of Inf (we use 64-bit wide int for this)
          ba      3f
          faddd   %f34,%f50,%f30                !dummy op just to generate exception (delay slot)
1:
        ld      [%o5+4],%o5                     !load x lower word
        sllx    %o0,32,%o0                      !left justify intf
        sllx    %g1,32,%g1              !left justify Inf
        or      %o0,%o5,%o0                     !merge in lower intf
        cmp     %o0,%g1                         !if intf > 0x7ff00000 00000000
        ble,pt  %xcc,2f                         !pass thru if NaN
          nop
          fmuld   %f34,%f34,%f34                !...... (x*x) trigger invalid exception
          ba      3f
          nop
2:
        faddd   %f46,%f48,%f34                  !ans = pi/2 upper + pi/2 lower
3:
        add     %i1,%i2,%i1                     !x += stridex
        for     %f34,%f40,%f34                  !sign(ans) = sign of argument
        std     %f34,[%i3]                      !*y = ans
        ba      .LOOP0                          !keep looping
        add     %i3,%i4,%i3                     !y += stridey (delay slot)

    /*--------------------------------------------------------------------------*/
    /*-----------SPECIAL CASE HANDLING FOR LOOP1 -------------------------------*/
    /*--------------------------------------------------------------------------*/

        .align  32                              !align on I-cache boundary
.SPECIAL1:
        orcc    %o2,%g0,%g0                     !(-) if intf < 0x3e300000
        bpos    1f                              !if >=...continue
        sethi   %hi(0x7ff00000),%g1     !upper word of Inf (we use 64-bit wide int for this)
          ba      3f
          faddd   %f36,%f50,%f30                !dummy op just to generate exception (delay slot)
1:
        ld      [%o5+4],%o5                     !load x lower word
        sllx    %o0,32,%o0                      !left justify intf
        sllx    %g1,32,%g1              !left justify Inf
        or      %o0,%o5,%o0                     !merge in lower intf
        cmp     %o0,%g1                         !if intf > 0x7ff00000 00000000
        ble,pt  %xcc,2f                         !pass thru if NaN
          nop
          fmuld   %f36,%f36,%f36                !...... (x*x) trigger invalid exception
          ba      3f
          nop
2:
        faddd   %f46,%f48,%f36                  !ans = pi/2 upper + pi/2 lower
3:
        add     %i1,%i2,%i1                     !x += stridex
        for     %f36,%f42,%f36                  !sign(ans) = sign of argument
        std     %f36,[%i3]                      !*y = ans
        ba      .LOOP1                          !keep looping
        add     %i3,%i4,%i3                     !y += stridey (delay slot)

    /*--------------------------------------------------------------------------*/
    /*------------SPECIAL CASE HANDLING FOR LOOP2 ------------------------------*/
    /*--------------------------------------------------------------------------*/

        .align  32                              !align on I-cache boundary
.SPECIAL2:
        orcc    %o2,%g0,%g0                     !(-) if intf < 0x3e300000
        bpos    1f                              !if >=...continue
        sethi   %hi(0x7ff00000),%g1     !upper word of Inf (we use 64-bit wide int for this)
          ba      3f
          faddd   %f38,%f50,%f30                !dummy op just to generate exception (delay slot)
1:
        ld      [%o5+4],%o5                     !load x lower word
        sllx    %o0,32,%o0                      !left justify intf
        sllx    %g1,32,%g1              !left justify Inf
        or      %o0,%o5,%o0                     !merge in lower intf
        cmp     %o0,%g1                         !if intf > 0x7ff00000 00000000
        ble,pt  %xcc,2f                         !pass thru if NaN
          nop
          fmuld   %f38,%f38,%f38                !...... (x*x) trigger invalid exception
          ba      3f
          nop
2:
        faddd   %f46,%f48,%f38                  !ans = pi/2 upper + pi/2 lower
3:
        add     %i1,%i2,%i1                     !x += stridex
        for     %f38,%f44,%f38                  !sign(ans) = sign of argument
        std     %f38,[%i3]                      !*y = ans
        ba      .LOOP2                          !keep looping
        add     %i3,%i4,%i3                     !y += stridey

    /*--------------------------------------------------------------------------*/
    /*--------------------------------------------------------------------------*/
    /*--------------------------------------------------------------------------*/

        SET_SIZE(__vatan)

!       .ident  "03-20-96 Sparc V9 3-way-unrolled version"