.file "__vatan.S"
#include "libm.h"
RO_DATA
! following is the C version of the ATAN algorithm
! #include <math.h>
! #include <stdio.h>
! double jkatan(double *x)
! {
! double f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy;
! int index, sign, intf, intz;
! extern const double __vlibm_TBL_atan1[];
! long *pf = (long *) &f, *pz = (long *) &z;
!
!
!
!
! #define p3 parray[0]
! #define p2 parray[1]
! #define p1 parray[2]
! #define soffset 3
!
! static const double parray[] = {
! -1.428029046844299722E-01,
! 1.999999917247000615E-01,
! -3.333333333329292858E-01,
! 1.0,
! -1.0,
! };
!
! f = *x;
! intf = pf[0];
! sign = intf & 0x80000000;
! intf ^= sign;
! sign = (unsigned) sign >> 31;
! pf[0] = intf;
!
! if( (intf > 0x43600000) || (intf < 0x3e300000) )
! {
! if( (intf > 0x7ff00000) ||
! ((intf == 0x7ff00000) && (pf[1] !=0)) ) return (*x-*x);
! if( intf < 0x3e300000 )
! {
! dummy = 1.0e37 + f;
! dummy = dummy;
! return (*x);
! }
! if( intf > 0x43600000 )
! {
! index = 2;
! f = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];
! f = parray[soffset + sign] * f;
! return (f);
! }
! }
!
! index = 0;
! if (intf > 0x40500000)
! { f = -1.0/f;
! index = 2;
! }
! else if( intf >= 0x3f900000 )
! {
! intz = (intf + 0x00008000) & 0x7fff0000;
! pz[0] = intz;
! pz[1] = 0;
! f = (f - z)/(1.0 + f*z);
! index = (intz - 0x3f900000) >> 15;
! index += 4;
! }
! conup = __vlibm_TBL_atan1[index];
! conlo = __vlibm_TBL_atan1[index+1];
! tmp = f*f;
! poly = (f*tmp)*((p3*tmp + p2)*tmp + p1);
! ansu = conup + f;
! ansl = (((conup - ansu) + f) + poly) + conlo;
! ans = ansu + ansl;
! ans = parray[soffset + sign] * ans;
! return ans;
! }
#define WSIZE 8
.align 32 !align with full D-cache line
.COEFFS:
.double 0r-1.428029046844299722E-01 !p[3]
.double 0r1.999999917247000615E-01 !p[2]
.double 0r-3.333333333329292858E-01 !p[1]
.double 0r-1.0, !constant -1.0
.word 0x00008000,0x0 !for fp rounding of reduced arg
.word 0x7fff0000,0x0 !for fp truncation
.word 0x47900000,0 !a number close to 1.0E37
.word 0x80000000,0x0 !mask for fp sign bit
.word 0x3f800000,0x0 !1.0/128.0 dummy "safe" argument
.type .COEFFS,#object
ENTRY(__vatan)
save %sp,-SA(MINFRAME)-16,%sp
PIC_SETUP(g5)
PIC_SET(g5,__vlibm_TBL_atan1,o4)
PIC_SET(g5,.COEFFS,o0)
cmp %i0,0 !if n <=0,
ble,pn %icc,.RETURN !....then do nothing
sll %i2,3,%i2 !convert stride to byte count
sll %i4,3,%i4 !convert stride to byte count
ldd [%o0],%f58 !load p[3]
mov 2,%i5 !argcount = 3
ldd [%o0+WSIZE],%f60 !load p[2]
add %fp,STACK_BIAS-8,%l1 !yaddr1 = &dummy
fzero %f18 !ansu1 = 0
ldd [%o0+2*WSIZE],%f62 !load p[1]
add %fp,STACK_BIAS-8,%l2 !yaddr2 = &dummy
fzero %f12 !(poly1) = 0
ldd [%o0+3*WSIZE],%f56 !-1.0
fzero %f14 !tmp1 = 0
ldd [%o0+4*WSIZE],%f52 !load rounding mask
fzero %f16 !conup1 = 0
ldd [%o0+5*WSIZE],%f54 !load truncation mask
fzero %f36 !f1 = 0
ldd [%o0+6*WSIZE],%f50 !1.0e37
fzero %f38 !f2 = 0
ldd [%o0+7*WSIZE],%f32 !mask for sign bit
ldd [%o4+2*WSIZE],%f46 !pi/2 upper
ldd [%o4+(2*WSIZE+8)],%f48 !pi/2 lower
sethi %hi(0x40500000),%l6 !64.0
sethi %hi(0x3f900000),%l7 !1/64.0
mov 0,%l4 !index1 = 0
mov 0,%l5 !index2 = 0
.MAINLOOP:
.LOOP0:
deccc %i0 !--n
bneg 1f
mov %i1,%o5 !xuse = x (delay slot)
ba 2f
nop !delay slot
1:
PIC_SET(g5,.COEFFS+8*WSIZE,o5)
dec %i5 !argcount--
2:
sethi %hi(0x80000000),%o7 !mask for sign bit
sethi %hi(0x43600000),%o1 !big = 0x43600000,0
ld [%o5],%o0 !intf = pf[0] = f upper
ldd [%o4+%l5],%f26 !conup2 = __vlibm_TBL_atan1[index2]
sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0
andn %o0,%o7,%o0 !intf = fabs(intf)
ldd [%o5],%f34 !f = *x into f34
sub %o1,%o0,%o1 !(-) if intf > big
sub %o0,%o2,%o2 !(-) if intf < small
fand %f34,%f32,%f40 !sign0 = sign bit
fmuld %f38,%f38,%f24 !tmp2= f2*f2
orcc %o1,%o2,%g0 !(-) if either true
bneg,pn %icc,.SPECIAL0 !if (-) goto special cases below
fabsd %f34,%f34 !abs(f) (delay slot)
!----------------------
sethi %hi(0x8000),%o7 !rounding bit
fpadd32 %f34,%f52,%f0 !intf + 0x00008000 (again)
faddd %f26,%f38,%f28 !ansu2 = conup2 + f2
add %o0,%o7,%o0 !intf + 0x00008000 (delay slot)
fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
fmuld %f58,%f24,%f22 !p[3]*tmp2
sethi %hi(0x7fff0000),%o7 !mask for rounding argument
fmuld %f34,%f0,%f10 !f*z
fsubd %f34,%f0,%f20 !f - z
add %o4,%l4,%l4 !base addr + index1
fmuld %f14,%f12,%f12 !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1)
faddd %f16,%f36,%f16 !(conup1 - ansu1) + f1
and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000
faddd %f22,%f60,%f22 !p[3]*tmp2 + p[2]
ldd [%l4+WSIZE],%f14 !conlo1 = __vlibm_TBL_atan1[index+1]
sub %o0,%l7,%o2 !intz - 0x3f900000
fsubd %f10,%f56,%f10 !(f*z - (-1.0))
faddd %f16,%f12,%f12 !((conup1 - ansu1) + f1) + poly1
cmp %o0,%l6 !(|f| > 64)
ble .ELSE0 !if(|f| > 64) then
sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15
mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower
ba .ENDIF0 !continue
fdivd %f56,%f34,%f34 !f = -1.0/f (delay slot)
.ELSE0: !else f( |x| >= (1/64))
cmp %o0,%l7 !if intf >= 1/64
bl .ENDIF0 !if( |x| >= (1/64) ) then...
mov 0,%o1 !index == 0 , point to conup,conlo = 0,0
add %o3,4,%o1 !index = index + 4
fdivd %f20,%f10,%f34 !f = (f - z)/(1.0 + f*z), reduced argument
.ENDIF0:
sll %o1,3,%l3 !index0 = index
mov %i3,%l0 !yaddr0 = address of y
faddd %f12,%f14,%f12 !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1
fmuld %f22,%f24,%f22 !(p3*tmp2 + p2)*tmp2
fsubd %f26,%f28,%f26 !conup2 - ansu2
add %i1,%i2,%i1 !x += stridex
add %i3,%i4,%i3 !y += stridey
faddd %f18,%f12,%f36 !ans1 = ansu1 + ansl1
fmuld %f38,%f24,%f24 !f*tmp2
faddd %f22,%f62,%f22 !(p3*tmp2 + p2)*tmp2 + p1
for %f36,%f42,%f36 !sign(ans1) = sign of argument
std %f36,[%l1] !*yaddr1 = ans1
add %o4,%l5,%l5 !base addr + index2
fmuld %f24,%f22,%f22 !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1)
faddd %f26,%f38,%f26 !(conup2 - ansu2) + f2
cmp %i5,0 !if argcount =0, we are done
be .RETURN
nop
.LOOP1:
deccc %i0 !--n
bneg 1f
mov %i1,%o5 !xuse = x (delay slot)
ba 2f
nop !delay slot
1:
PIC_SET(g5,.COEFFS+8*WSIZE,o5)
dec %i5 !argcount--
2:
sethi %hi(0x80000000),%o7 !mask for sign bit
sethi %hi(0x43600000),%o1 !big = 0x43600000,0
ld [%o5],%o0 !intf = pf[0] = f upper
sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0
andn %o0,%o7,%o0 !intf = fabs(intf)
ldd [%o5],%f36 !f = *x into f36
sub %o1,%o0,%o1 !(-) if intf > big
sub %o0,%o2,%o2 !(-) if intf < small
fand %f36,%f32,%f42 !sign1 = sign bit
orcc %o1,%o2,%g0 !(-) if either true
bneg,pn %icc,.SPECIAL1 !if (-) goto special cases below
fabsd %f36,%f36 !abs(f) (delay slot)
!----------------------
fpadd32 %f36,%f52,%f0 !intf + 0x00008000 (again)
ldd [%l5+WSIZE],%f24 !conlo2 = __vlibm_TBL_atan1[index2+1]
fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
sethi %hi(0x8000),%o7 !rounding bit
faddd %f26,%f22,%f22 !((conup2 - ansu2) + f2) + poly2
add %o0,%o7,%o0 !intf + 0x00008000 (delay slot)
sethi %hi(0x7fff0000),%o7 !mask for rounding argument
fmuld %f36,%f0,%f10 !f*z
fsubd %f36,%f0,%f20 !f - z
and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000
faddd %f22,%f24,%f22 !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2
sub %o0,%l7,%o2 !intz - 0x3f900000
fsubd %f10,%f56,%f10 !(f*z - (-1.0))
ldd [%o4+%l3],%f6 !conup0 = __vlibm_TBL_atan1[index0]
cmp %o0,%l6 !(|f| > 64)
ble .ELSE1 !if(|f| > 64) then
sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15
mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower
ba .ENDIF1 !continue
fdivd %f56,%f36,%f36 !f = -1.0/f (delay slot)
.ELSE1: !else f( |x| >= (1/64))
cmp %o0,%l7 !if intf >= 1/64
bl .ENDIF1 !if( |x| >= (1/64) ) then...
mov 0,%o1 !index == 0 , point to conup,conlo = 0,0
add %o3,4,%o1 !index = index + 4
fdivd %f20,%f10,%f36 !f = (f - z)/(1.0 + f*z), reduced argument
.ENDIF1:
sll %o1,3,%l4 !index1 = index
mov %i3,%l1 !yaddr1 = address of y
fmuld %f34,%f34,%f4 !tmp0= f0*f0
faddd %f28,%f22,%f38 !ans2 = ansu2 + ansl2
add %i1,%i2,%i1 !x += stridex
add %i3,%i4,%i3 !y += stridey
fmuld %f58,%f4,%f2 !p[3]*tmp0
faddd %f6,%f34,%f8 !ansu0 = conup0 + f0
for %f38,%f44,%f38 !sign(ans2) = sign of argument
std %f38,[%l2] !*yaddr2 = ans2
cmp %i5,0 !if argcount =0, we are done
be .RETURN
nop
.LOOP2:
deccc %i0 !--n
bneg 1f
mov %i1,%o5 !xuse = x (delay slot)
ba 2f
nop !delay slot
1:
PIC_SET(g5,.COEFFS+8*WSIZE,o5)
dec %i5 !argcount--
2:
sethi %hi(0x80000000),%o7 !mask for sign bit
sethi %hi(0x43600000),%o1 !big = 0x43600000,0
ld [%o5],%o0 !intf = pf[0] = f upper
sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0
andn %o0,%o7,%o0 !intf = fabs(intf)
ldd [%o5],%f38 !f = *x into f38
sub %o1,%o0,%o1 !(-) if intf > big
sub %o0,%o2,%o2 !(-) if intf < small
fand %f38,%f32,%f44 !sign2 = sign bit
orcc %o1,%o2,%g0 !(-) if either true
bneg,pn %icc,.SPECIAL2 !if (-) goto special cases below
fabsd %f38,%f38 !abs(f) (delay slot)
!----------------------
fpadd32 %f38,%f52,%f0 !intf + 0x00008000 (again)
faddd %f2,%f60,%f2 !p[3]*tmp0 + p[2]
sethi %hi(0x8000),%o7 !rounding bit
fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
add %o0,%o7,%o0 !intf + 0x00008000 (delay slot)
sethi %hi(0x7fff0000),%o7 !mask for rounding argument
fmuld %f38,%f0,%f10 !f*z
fsubd %f38,%f0,%f20 !f - z
and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000
fmuld %f2,%f4,%f2 !(p3*tmp0 + p2)*tmp0
fsubd %f6,%f8,%f6 !conup0 - ansu0
sub %o0,%l7,%o2 !intz - 0x3f900000
fsubd %f10,%f56,%f10 !(f*z - (-1.0))
ldd [%o4+%l4],%f16 !conup1 = __vlibm_TBL_atan1[index1]
cmp %o0,%l6 !(|f| > 64)
ble .ELSE2 !if(|f| > 64) then
sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15
mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower
ba .ENDIF2 !continue
fdivd %f56,%f38,%f38 !f = -1.0/f (delay slot)
.ELSE2: !else f( |x| >= (1/64))
cmp %o0,%l7 !if intf >= 1/64
bl .ENDIF2 !if( |x| >= (1/64) ) then...
mov 0,%o1 !index == 0 , point to conup,conlo = 0,0
add %o3,4,%o1 !index = index + 4
fdivd %f20,%f10,%f38 !f = (f - z)/(1.0 + f*z), reduced argument
.ENDIF2:
sll %o1,3,%l5 !index2 = index
mov %i3,%l2 !yaddr2 = address of y
fmuld %f34,%f4,%f4 !f0*tmp0
faddd %f2,%f62,%f2 !(p3*tmp0 + p2)*tmp0 + p1
fmuld %f36,%f36,%f14 !tmp1= f1*f1
add %o4,%l3,%l3 !base addr + index0
fmuld %f4,%f2,%f2 !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1)
faddd %f6,%f34,%f6 !(conup0 - ansu0) + f0
fmuld %f58,%f14,%f12 !p[3]*tmp1
faddd %f16,%f36,%f18 !ansu1 = conup1 + f1
ldd [%l3+WSIZE],%f4 !conlo0 = __vlibm_TBL_atan1[index0+1]
add %i1,%i2,%i1 !x += stridex
add %i3,%i4,%i3 !y += stridey
faddd %f6,%f2,%f2 !((conup0 - ansu0) + f0) + poly0
faddd %f12,%f60,%f12 !p[3]*tmp1 + p[2]
faddd %f2,%f4,%f2 !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0
fmuld %f12,%f14,%f12 !(p3*tmp1 + p2)*tmp1
fsubd %f16,%f18,%f16 !conup1 - ansu1
faddd %f8,%f2,%f34 !ans0 = ansu0 + ansl0
fmuld %f36,%f14,%f14 !f1*tmp1
faddd %f12,%f62,%f12 !(p3*tmp1 + p2)*tmp1 + p1
for %f34,%f40,%f34 !sign(ans0) = sign of argument
std %f34,[%l0] !*yaddr0 = ans, always gets stored (delay slot)
cmp %i5,0 !if argcount =0, we are done
bg .MAINLOOP
nop
.RETURN:
ret
restore %g0,%g0,%g0
.align 32 !align on I-cache boundary
.SPECIAL0:
orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000
bpos 1f !if >=...continue
sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this)
ba 3f
faddd %f34,%f50,%f30 !dummy op just to generate exception (delay slot)
1:
ld [%o5+4],%o5 !load x lower word
sllx %o0,32,%o0 !left justify intf
sllx %g1,32,%g1 !left justify Inf
or %o0,%o5,%o0 !merge in lower intf
cmp %o0,%g1 !if intf > 0x7ff00000 00000000
ble,pt %xcc,2f !pass thru if NaN
nop
fmuld %f34,%f34,%f34 !...... (x*x) trigger invalid exception
ba 3f
nop
2:
faddd %f46,%f48,%f34 !ans = pi/2 upper + pi/2 lower
3:
add %i1,%i2,%i1 !x += stridex
for %f34,%f40,%f34 !sign(ans) = sign of argument
std %f34,[%i3] !*y = ans
ba .LOOP0 !keep looping
add %i3,%i4,%i3 !y += stridey (delay slot)
.align 32 !align on I-cache boundary
.SPECIAL1:
orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000
bpos 1f !if >=...continue
sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this)
ba 3f
faddd %f36,%f50,%f30 !dummy op just to generate exception (delay slot)
1:
ld [%o5+4],%o5 !load x lower word
sllx %o0,32,%o0 !left justify intf
sllx %g1,32,%g1 !left justify Inf
or %o0,%o5,%o0 !merge in lower intf
cmp %o0,%g1 !if intf > 0x7ff00000 00000000
ble,pt %xcc,2f !pass thru if NaN
nop
fmuld %f36,%f36,%f36 !...... (x*x) trigger invalid exception
ba 3f
nop
2:
faddd %f46,%f48,%f36 !ans = pi/2 upper + pi/2 lower
3:
add %i1,%i2,%i1 !x += stridex
for %f36,%f42,%f36 !sign(ans) = sign of argument
std %f36,[%i3] !*y = ans
ba .LOOP1 !keep looping
add %i3,%i4,%i3 !y += stridey (delay slot)
.align 32 !align on I-cache boundary
.SPECIAL2:
orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000
bpos 1f !if >=...continue
sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this)
ba 3f
faddd %f38,%f50,%f30 !dummy op just to generate exception (delay slot)
1:
ld [%o5+4],%o5 !load x lower word
sllx %o0,32,%o0 !left justify intf
sllx %g1,32,%g1 !left justify Inf
or %o0,%o5,%o0 !merge in lower intf
cmp %o0,%g1 !if intf > 0x7ff00000 00000000
ble,pt %xcc,2f !pass thru if NaN
nop
fmuld %f38,%f38,%f38 !...... (x*x) trigger invalid exception
ba 3f
nop
2:
faddd %f46,%f48,%f38 !ans = pi/2 upper + pi/2 lower
3:
add %i1,%i2,%i1 !x += stridex
for %f38,%f44,%f38 !sign(ans) = sign of argument
std %f38,[%i3] !*y = ans
ba .LOOP2 !keep looping
add %i3,%i4,%i3 !y += stridey
SET_SIZE(__vatan)
! .ident "03-20-96 Sparc V9 3-way-unrolled version"