root/usr/src/lib/libmvec/common/__vhypot.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/isa_defs.h>
#include "libm_inlines.h"

#ifdef _LITTLE_ENDIAN
#define HI(x)   *(1+(int *)x)
#define LO(x)   *(unsigned *)x
#else
#define HI(x)   *(int *)x
#define LO(x)   *(1+(unsigned *)x)
#endif

#ifdef __RESTRICT
#define restrict _Restrict
#else
#define restrict
#endif

/*
 * double hypot(double x, double y)
 *
 * Method :
 *      1. Special cases:
 *              x or y is +Inf or -Inf                          => +Inf
 *              x or y is NaN                                   => QNaN
 *      2. Computes hypot(x,y):
 *              hypot(x,y) = m * sqrt(xnm * xnm + ynm * ynm)
 *      Where:
 *              m = max(|x|,|y|)
 *              xnm = x * (1/m)
 *              ynm = y * (1/m)
 *
 *      Compute xnm * xnm + ynm * ynm by simulating
 *      muti-precision arithmetic.
 *
 * Accuracy:
 *      Maximum error observed: less than 0.872 ulp after 16.777.216.000
 *      results.
 */

extern double sqrt(double);
extern double fabs(double);

static const unsigned long long LCONST[] = {
0x41b0000000000000ULL,  /* D2ON28 = 2 ** 28             */
0x0010000000000000ULL,  /* D2ONM1022 = 2 ** -1022       */
0x7fd0000000000000ULL   /* D2ONP1022 = 2 **  1022       */
};

static void
__vhypot_n(int n, double *restrict px, int stridex, double *restrict py,
    int stridey, double *restrict pz, int stridez);

#define RETURN(ret)                                             \
{                                                               \
        *pz = (ret);                                            \
        py += stridey;                                          \
        pz += stridez;                                          \
        if (n_n == 0)                                           \
        {                                                       \
                hx0 = HI(px);                                   \
                hy0 = HI(py);                                   \
                spx = px;                                       \
                spy = py;                                       \
                spz = pz;                                       \
                continue;                                       \
        }                                                       \
        n--;                                                    \
        break;                                                  \
}

void
__vhypot(int n, double *restrict px, int stridex, double *restrict py,
    int stridey, double *restrict pz, int stridez)
{
        int             hx0, hx1, hy0, j0, diff;
        double          x_hi, x_lo, y_hi, y_lo;
        double          scl = 0;
        double          x, y, res;
        double          *spx, *spy, *spz;
        int             n_n;
        double          D2ON28 = ((double *)LCONST)[0];  /* 2 ** 28     */
        double          D2ONM1022 = ((double *)LCONST)[1]; /* 2 **-1022 */
        double          D2ONP1022 = ((double *)LCONST)[2]; /* 2 ** 1022 */

        while (n > 1) {
                n_n = 0;
                spx = px;
                spy = py;
                spz = pz;
                hx0 = HI(px);
                hy0 = HI(py);
                for (; n > 1; n--) {
                        px += stridex;
                        hx0 &= 0x7fffffff;
                        hy0 &= 0x7fffffff;

                        if (hx0 >= 0x7fe00000) {
                                /* |X| >= 2**1023 or Inf or NaN */
                                diff = hy0 - hx0;
                                j0 = diff >> 31;
                                j0 = hy0 - (diff & j0);
                                j0 &= 0x7ff00000;
                                x = *(px - stridex);
                                y = *py;
                                x = fabs(x);
                                y = fabs(y);
                                if (j0 >= 0x7ff00000) {
                                        /* |X| or |Y| = Inf or NaN */
                                        int lx = LO((px - stridex));
                                        int ly = LO(py);
                                        if (hx0 == 0x7ff00000 && lx == 0)
                                                res = x == y ? y : x;
                                        else if (hy0 == 0x7ff00000 && ly == 0)
                                                res = x == y ? x : y;
                                        else
                                                res = x + y;
                                        RETURN(res)
                                } else {
                                        j0 = diff >> 31;
                                        if (((diff ^ j0) - j0) < 0x03600000) {
                                                /*
                                                 * max(|X|,|Y|)/min(|X|,|Y|) <
                                                 * 2**54
                                                 */
                                                x *= D2ONM1022;
                                                y *= D2ONM1022;

                                                x_hi = (x + D2ON28) - D2ON28;
                                                x_lo = x - x_hi;
                                                y_hi = (y + D2ON28) - D2ON28;
                                                y_lo = y - y_hi;
                                                res = x_hi * x_hi + y_hi * y_hi;
                                                res += ((x + x_hi) * x_lo +
                                                    (y + y_hi) * y_lo);

                                                res = sqrt(res);

                                                res = D2ONP1022 * res;
                                                RETURN(res)
                                        } else {
                                                RETURN(x + y)
                                        }
                                }
                        }
                        if (hy0 >= 0x7fe00000) {
                                /* |Y| >= 2**1023 or Inf or NaN */
                                diff = hy0 - hx0;
                                j0 = diff >> 31;
                                j0 = hy0 - (diff & j0);
                                j0 &= 0x7ff00000;
                                x = *(px - stridex);
                                y = *py;
                                x = fabs(x);
                                y = fabs(y);
                                if (j0 >= 0x7ff00000) {
                                        /* |X| or |Y| = Inf or NaN */
                                        int lx = LO((px - stridex));
                                        int ly = LO(py);
                                        if (hx0 == 0x7ff00000 && lx == 0)
                                                res = x == y ? y : x;
                                        else if (hy0 == 0x7ff00000 && ly == 0)
                                                res = x == y ? x : y;
                                        else
                                                res = x + y;
                                        RETURN(res)
                                } else {
                                        j0 = diff >> 31;
                                        if (((diff ^ j0) - j0) < 0x03600000) {
                                                /*
                                                 * max(|X|,|Y|)/min(|X|,|Y|) <
                                                 * 2**54
                                                 */
                                                x *= D2ONM1022;
                                                y *= D2ONM1022;

                                                x_hi = (x + D2ON28) - D2ON28;
                                                x_lo = x - x_hi;
                                                y_hi = (y + D2ON28) - D2ON28;
                                                y_lo = y - y_hi;
                                                res = x_hi * x_hi + y_hi * y_hi;
                                                res += ((x + x_hi) * x_lo +
                                                    (y + y_hi) * y_lo);

                                                res = sqrt(res);

                                                res = D2ONP1022 * res;
                                                RETURN(res)
                                        } else {
                                                RETURN(x + y)
                                        }
                                }
                        }

                        hx1 = HI(px);

                        if (hx0 < 0x00100000 && hy0 < 0x00100000) {
                                /* X and Y are subnormal */
                                x = *(px - stridex);
                                y = *py;

                                x *= D2ONP1022;
                                y *= D2ONP1022;

                                x_hi = (x + D2ON28) - D2ON28;
                                x_lo = x - x_hi;
                                y_hi = (y + D2ON28) - D2ON28;
                                y_lo = y - y_hi;
                                res = (x_hi * x_hi + y_hi * y_hi);
                                res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);

                                res = sqrt(res);

                                res = D2ONM1022 * res;
                                RETURN(res)
                        }

                        hx0 = hx1;
                        py += stridey;
                        pz += stridez;
                        n_n++;
                        hy0 = HI(py);
                }
                if (n_n > 0)
                        __vhypot_n(n_n, spx, stridex, spy, stridey, spz,
                            stridez);
        }

        if (n > 0) {
                x = *px;
                y = *py;
                hx0 = HI(px);
                hy0 = HI(py);

                hx0 &= 0x7fffffff;
                hy0 &= 0x7fffffff;

                diff = hy0 - hx0;
                j0 = diff >> 31;
                j0 = hy0 - (diff & j0);
                j0 &= 0x7ff00000;

                if (j0 >= 0x7fe00000) {
                        /* max(|X|,|Y|) >= 2**1023 or X or Y = Inf or NaN */
                        x = fabs(x);
                        y = fabs(y);
                        if (j0 >= 0x7ff00000)   /* |X| or |Y| = Inf or NaN */
                        {
                                int lx = LO(px);
                                int ly = LO(py);
                                if (hx0 == 0x7ff00000 && lx == 0)
                                        res = x == y ? y : x;
                                else if (hy0 == 0x7ff00000 && ly == 0)
                                        res = x == y ? x : y;
                                else
                                        res = x + y;
                                *pz = res;
                                return;
                        } else {
                                j0 = diff >> 31;
                                if (((diff ^ j0) - j0) < 0x03600000) {
                                        /* max(|X|,|Y|)/min(|X|,|Y|) < 2**54 */
                                        x *= D2ONM1022;
                                        y *= D2ONM1022;

                                        x_hi = (x + D2ON28) - D2ON28;
                                        x_lo = x - x_hi;
                                        y_hi = (y + D2ON28) - D2ON28;
                                        y_lo = y - y_hi;
                                        res = (x_hi * x_hi + y_hi * y_hi);
                                        res += ((x + x_hi) * x_lo +
                                            (y + y_hi) * y_lo);

                                        res = sqrt(res);

                                        res = D2ONP1022 * res;
                                        *pz = res;
                                        return;
                                } else {
                                        *pz = x + y;
                                        return;
                                }
                        }
                }

                if (j0 < 0x00100000) {  /* X and Y are subnormal */
                        x *= D2ONP1022;
                        y *= D2ONP1022;

                        x_hi = (x + D2ON28) - D2ON28;
                        x_lo = x - x_hi;
                        y_hi = (y + D2ON28) - D2ON28;
                        y_lo = y - y_hi;
                        res = (x_hi * x_hi + y_hi * y_hi);
                        res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);

                        res = sqrt(res);

                        res = D2ONM1022 * res;
                        *pz = res;
                        return;
                }

                HI(&scl) = (0x7fe00000 - j0);

                x *= scl;
                y *= scl;

                x_hi = (x + D2ON28) - D2ON28;
                y_hi = (y + D2ON28) - D2ON28;
                x_lo = x - x_hi;
                y_lo = y - y_hi;

                res = (x_hi * x_hi + y_hi * y_hi);
                res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);

                res = sqrt(res);

                HI(&scl) = j0;

                res = scl * res;
                *pz = res;
        }
}

static void
__vhypot_n(int n, double *restrict px, int stridex, double *restrict py,
    int stridey, double *restrict pz, int stridez)
{
        int             hx0, hy0, j0, diff0;
        double          x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0;
        double          x0, y0, res0;
        double          D2ON28 = ((double *)LCONST)[0]; /* 2 ** 28 */

        for (; n > 0; n--) {
                x0 = *px;
                y0 = *py;
                hx0 = HI(px);
                hy0 = HI(py);

                hx0 &= 0x7fffffff;
                hy0 &= 0x7fffffff;

                diff0 = hy0 - hx0;
                j0 = diff0 >> 31;
                j0 = hy0 - (diff0 & j0);
                j0 &= 0x7ff00000;

                px += stridex;
                py += stridey;

                HI(&scl0) = (0x7fe00000 - j0);

                x0 *= scl0;
                y0 *= scl0;

                x_hi0 = (x0 + D2ON28) - D2ON28;
                y_hi0 = (y0 + D2ON28) - D2ON28;
                x_lo0 = x0 - x_hi0;
                y_lo0 = y0 - y_hi0;

                res0 = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
                res0 += ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);

                res0 = sqrt(res0);

                HI(&scl0) = j0;

                res0 = scl0 * res0;
                *pz = res0;

                pz += stridez;
        }
}