root/usr/src/lib/libmvec/common/__vatan2f.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifdef __RESTRICT
#define restrict _Restrict
#else
#define restrict
#endif

extern const double __vlibm_TBL_atan1[];

static const double
pio4    =  7.8539816339744827900e-01,
pio2    =  1.5707963267948965580e+00,
pi      =  3.1415926535897931160e+00;

static const float
zero    =  0.0f,
one     =  1.0f,
q1      = -3.3333333333296428046e-01f,
q2      =  1.9999999186853752618e-01f,
twop24  =  16777216.0f;

void
__vatan2f(int n, float * restrict y, int stridey, float * restrict x,
        int stridex, float * restrict z, int stridez)
{
        float           x0, x1, x2, y0, y1, y2, *pz0 = 0, *pz1, *pz2;
        double          ah0, ah1, ah2;
        double          t0, t1, t2;
        double          sx0, sx1, sx2;
        double          sign0, sign1, sign2;
        int             i, k0 = 0, k1, k2, hx, sx, sy;
        int             hy0, hy1, hy2;
        float           base0 = 0.0, base1, base2;
        double          num0, num1, num2;
        double          den0, den1, den2;
        double          dx0, dx1, dx2;
        double          dy0, dy1, dy2;
        double          db0, db1, db2;

        do
        {
loop0:
                hy0 = *(int*)y;
                hx = *(int*)x;
                sign0 = one;
                sy = hy0 & 0x80000000;
                hy0 &= ~0x80000000;

                sx = hx & 0x80000000;
                hx &= ~0x80000000;

                if (hy0 > hx)
                {
                        x0 = *y;
                        y0 = *x;
                        i = hx;
                        hx = hy0;
                        hy0 = i;
                        if (sy)
                        {
                                x0 = -x0;
                                sign0 = -sign0;
                        }
                        if (sx)
                        {
                                y0 = -y0;
                                ah0 = pio2;
                        }
                        else
                        {
                                ah0 = -pio2;
                                sign0 = -sign0;
                        }
                }
                else
                {
                        y0 = *y;
                        x0 = *x;
                        if (sy)
                        {
                                y0 = -y0;
                                sign0 = -sign0;
                        }
                        if (sx)
                        {
                                x0 = -x0;
                                ah0 = -pi;
                                sign0 = -sign0;
                        }
                        else
                                ah0 = zero;
                }

                if (hx >= 0x7f800000 || hx - hy0 >= 0x0c800000)
                {
                        if (hx >= 0x7f800000)
                        {
                                if (hx ^ 0x7f800000) /* nan */
                                        ah0 =  x0 + y0;
                                else if (hy0 >= 0x7f800000)
                                        ah0 += pio4;
                        }
                        else if ((int) ah0 == 0)
                                ah0 = y0 / x0;
                        *z = (sign0 == one) ? ah0 : -ah0;
/* sign0*ah0 would change nan behavior relative to previous release */
                        x += stridex;
                        y += stridey;
                        z += stridez;
                        i = 0;
                        if (--n <= 0)
                                break;
                        goto loop0;
                }
                if (hy0 < 0x00800000) {
                        if (hy0 == 0)
                        {
                                *z = sign0 * (float) ah0;
                                x += stridex;
                                y += stridey;
                                z += stridez;
                                i = 0;
                                if (--n <= 0)
                                        break;
                                goto loop0;
                        }
                        y0 *= twop24; /* scale subnormal y */
                        x0 *= twop24; /* scale possibly subnormal x */
                        hy0 = *(int*)&y0;
                        hx = *(int*)&x0;
                }
                pz0 = z;

                k0 = (hy0 - hx + 0x3f800000) & 0xfff80000;
                if (k0 >= 0x3C800000)          /* if |x| >= (1/64)... */
                {
                        *(int*)&base0 = k0;
                        k0 = (k0 - 0x3C800000) >> 18; /* (index >> 19) << 1) */
                        k0 += 4;
                                /* skip over 0,0,pi/2,pi/2 */
                }
                else                            /* |x| < 1/64 */
                {
                        k0 = 0;
                        base0 = zero;
                }

                x += stridex;
                y += stridey;
                z += stridez;
                i = 1;
                if (--n <= 0)
                        break;


loop1:
                hy1 = *(int*)y;
                hx = *(int*)x;
                sign1 = one;
                sy = hy1 & 0x80000000;
                hy1 &= ~0x80000000;

                sx = hx & 0x80000000;
                hx &= ~0x80000000;

                if (hy1 > hx)
                {
                        x1 = *y;
                        y1 = *x;
                        i = hx;
                        hx = hy1;
                        hy1 = i;
                        if (sy)
                        {
                                x1 = -x1;
                                sign1 = -sign1;
                        }
                        if (sx)
                        {
                                y1 = -y1;
                                ah1 = pio2;
                        }
                        else
                        {
                                ah1 = -pio2;
                                sign1 = -sign1;
                        }
                }
                else
                {
                        y1 = *y;
                        x1 = *x;
                        if (sy)
                        {
                                y1 = -y1;
                                sign1 = -sign1;
                        }
                        if (sx)
                        {
                                x1 = -x1;
                                ah1 = -pi;
                                sign1 = -sign1;
                        }
                        else
                                ah1 = zero;
                }

                if (hx >= 0x7f800000 || hx - hy1 >= 0x0c800000)
                {
                        if (hx >= 0x7f800000)
                        {
                                if (hx ^ 0x7f800000) /* nan */
                                        ah1 =  x1 + y1;
                                else if (hy1 >= 0x7f800000)
                                        ah1 += pio4;
                        }
                        else if ((int) ah1 == 0)
                                ah1 = y1 / x1;
                        *z = (sign1 == one)? ah1 : -ah1;
                        x += stridex;
                        y += stridey;
                        z += stridez;
                        i = 1;
                        if (--n <= 0)
                                break;
                        goto loop1;
                }
                if (hy1 < 0x00800000) {
                        if (hy1 == 0)
                        {
                                *z = sign1 * (float) ah1;
                                x += stridex;
                                y += stridey;
                                z += stridez;
                                i = 1;
                                if (--n <= 0)
                                        break;
                                goto loop1;
                        }
                        y1 *= twop24; /* scale subnormal y */
                        x1 *= twop24; /* scale possibly subnormal x */
                        hy1 = *(int*)&y1;
                        hx = *(int*)&x1;
                }
                pz1 = z;

                k1 = (hy1 - hx + 0x3f800000) & 0xfff80000;
                if (k1 >= 0x3C800000)          /* if |x| >= (1/64)... */
                {
                        *(int*)&base1 = k1;
                        k1 = (k1 - 0x3C800000) >> 18; /* (index >> 19) << 1) */
                        k1 += 4;
                                /* skip over 0,0,pi/2,pi/2 */
                }
                else                            /* |x| < 1/64 */
                {
                        k1 = 0;
                        base1 = zero;
                }

                x += stridex;
                y += stridey;
                z += stridez;
                i = 2;
                if (--n <= 0)
                        break;

loop2:
                hy2 = *(int*)y;
                hx = *(int*)x;
                sign2 = one;
                sy = hy2 & 0x80000000;
                hy2 &= ~0x80000000;

                sx = hx & 0x80000000;
                hx &= ~0x80000000;

                if (hy2 > hx)
                {
                        x2 = *y;
                        y2 = *x;
                        i = hx;
                        hx = hy2;
                        hy2 = i;
                        if (sy)
                        {
                                x2 = -x2;
                                sign2 = -sign2;
                        }
                        if (sx)
                        {
                                y2 = -y2;
                                ah2 = pio2;
                        }
                        else
                        {
                                ah2 = -pio2;
                                sign2 = -sign2;
                        }
                }
                else
                {
                        y2 = *y;
                        x2 = *x;
                        if (sy)
                        {
                                y2 = -y2;
                                sign2 = -sign2;
                        }
                        if (sx)
                        {
                                x2 = -x2;
                                ah2 = -pi;
                                sign2 = -sign2;
                        }
                        else
                                ah2 = zero;
                }

                if (hx >= 0x7f800000 || hx - hy2 >= 0x0c800000)
                {
                        if (hx >= 0x7f800000)
                        {
                                if (hx ^ 0x7f800000) /* nan */
                                        ah2 =  x2 + y2;
                                else if (hy2 >= 0x7f800000)
                                        ah2 += pio4;
                        }
                        else if ((int) ah2 == 0)
                                ah2 = y2 / x2;
                        *z = (sign2 == one)? ah2 : -ah2;
                        x += stridex;
                        y += stridey;
                        z += stridez;
                        i = 2;
                        if (--n <= 0)
                                break;
                        goto loop2;
                }
                if (hy2 < 0x00800000) {
                        if (hy2 == 0)
                        {
                                *z = sign2 * (float) ah2;
                                x += stridex;
                                y += stridey;
                                z += stridez;
                                i = 2;
                                if (--n <= 0)
                                        break;
                                goto loop2;
                        }
                        y2 *= twop24; /* scale subnormal y */
                        x2 *= twop24; /* scale possibly subnormal x */
                        hy2 = *(int*)&y2;
                        hx = *(int*)&x2;
                }

                pz2 = z;

                k2 = (hy2 - hx + 0x3f800000) & 0xfff80000;
                if (k2 >= 0x3C800000)          /* if |x| >= (1/64)... */
                {
                        *(int*)&base2 = k2;
                        k2 = (k2 - 0x3C800000) >> 18; /* (index >> 19) << 1) */
                        k2 += 4;
                                /* skip over 0,0,pi/2,pi/2 */
                }
                else                            /* |x| < 1/64 */
                {
                        k2 = 0;
                        base2 = zero;
                }

                goto endloop;

endloop:

                ah2 += __vlibm_TBL_atan1[k2];
                ah1 += __vlibm_TBL_atan1[k1];
                ah0 += __vlibm_TBL_atan1[k0];

                db2 = base2;
                db1 = base1;
                db0 = base0;
                dy2 = y2;
                dy1 = y1;
                dy0 = y0;
                dx2 = x2;
                dx1 = x1;
                dx0 = x0;

                num2 = dy2 - dx2 * db2;
                den2 = dx2 + dy2 * db2;

                num1 = dy1 - dx1 * db1;
                den1 = dx1 + dy1 * db1;

                num0 = dy0 - dx0 * db0;
                den0 = dx0 + dy0 * db0;

                t2 = num2 / den2;
                t1 = num1 / den1;
                t0 = num0 / den0;

                sx2 = t2 * t2;
                sx1 = t1 * t1;
                sx0 = t0 * t0;

                t2 += t2 * sx2 * (q1 + sx2 * q2);
                t1 += t1 * sx1 * (q1 + sx1 * q2);
                t0 += t0 * sx0 * (q1 + sx0 * q2);

                t2 += ah2;
                t1 += ah1;
                t0 += ah0;

                *pz2 = sign2 * t2;
                *pz1 = sign1 * t1;
                *pz0 = sign0 * t0;

                x += stridex;
                y += stridey;
                z += stridez;
                i = 0;
        } while (--n > 0);

        if (i > 1)
        {
                ah1 += __vlibm_TBL_atan1[k1];
                t1 = (y1 - x1 * (double)base1) /
                        (x1 + y1 * (double)base1);
                sx1 = t1 * t1;
                t1 += t1 * sx1 * (q1 + sx1 * q2);
                t1 += ah1;
                *pz1 = sign1 * t1;
        }

        if (i > 0)
        {
                ah0 += __vlibm_TBL_atan1[k0];
                t0 = (y0 - x0 * (double)base0) /
                        (x0 + y0 * (double)base0);
                sx0 = t0 * t0;
                t0 += t0 * sx0 * (q1 + sx0 * q2);
                t0 += ah0;
                *pz0 = sign0 * t0;
        }
}