root/usr/src/lib/libmvec/common/vis/__vrsqrtf.S
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

        .file   "__vrsqrtf.S"

#include "libm.h"

        RO_DATA
        .align  64

! i = [0,63]
! TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24;
! TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
! i = [64,127]
! TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23;
! TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));

.CONST_TBL:
! Lookup table: 128 rows of 16 bytes (2048 bytes in total; the constants
! that follow are addressed from .CONST_TBL+2048).  Each row holds two
! doubles, each written as high word then low word:
!   offset 0: TBL[2*i]    - loaded as tbl_div
!   offset 8: TBL[2*i+1]  - loaded as tbl_sqrt
! The byte offset of a row is si = (ax >> 13) & 0x7f0, where ax is the
! raw bit pattern of the float argument.
! NOTE(review): every row ends with a trailing comma.  The fixed 2048
! offset relies on the assembler not emitting an extra word for it -
! confirm before switching assemblers.
!
! rows i = 0 .. 63
        .word   0x3e800000, 0x00000000, 0x3ff6a09e, 0x667f3bcd,
        .word   0x3e7f81f8, 0x1f81f820, 0x3ff673e3, 0x2ef63a03,
        .word   0x3e7f07c1, 0xf07c1f08, 0x3ff6482d, 0x37a5a3d2,
        .word   0x3e7e9131, 0xabf0b767, 0x3ff61d72, 0xb7978671,
        .word   0x3e7e1e1e, 0x1e1e1e1e, 0x3ff5f3aa, 0x673fa911,
        .word   0x3e7dae60, 0x76b981db, 0x3ff5cacb, 0x7802f342,
        .word   0x3e7d41d4, 0x1d41d41d, 0x3ff5a2cd, 0x8c69d61a,
        .word   0x3e7cd856, 0x89039b0b, 0x3ff57ba8, 0xb0ee01b9,
        .word   0x3e7c71c7, 0x1c71c71c, 0x3ff55555, 0x55555555,
        .word   0x3e7c0e07, 0x0381c0e0, 0x3ff52fcc, 0x468d6b54,
        .word   0x3e7bacf9, 0x14c1bad0, 0x3ff50b06, 0xa8fc6b70,
        .word   0x3e7b4e81, 0xb4e81b4f, 0x3ff4e6fd, 0xf33cf032,
        .word   0x3e7af286, 0xbca1af28, 0x3ff4c3ab, 0xe93bcf74,
        .word   0x3e7a98ef, 0x606a63be, 0x3ff4a10a, 0x97af7b92,
        .word   0x3e7a41a4, 0x1a41a41a, 0x3ff47f14, 0x4fe17f9f,
        .word   0x3e79ec8e, 0x951033d9, 0x3ff45dc3, 0xa3c34fa3,
        .word   0x3e799999, 0x9999999a, 0x3ff43d13, 0x6248490f,
        .word   0x3e7948b0, 0xfcd6e9e0, 0x3ff41cfe, 0x93ff5199,
        .word   0x3e78f9c1, 0x8f9c18fa, 0x3ff3fd80, 0x77e70577,
        .word   0x3e78acb9, 0x0f6bf3aa, 0x3ff3de94, 0x8077db58,
        .word   0x3e786186, 0x18618618, 0x3ff3c036, 0x50e00e03,
        .word   0x3e781818, 0x18181818, 0x3ff3a261, 0xba6d7a37,
        .word   0x3e77d05f, 0x417d05f4, 0x3ff38512, 0xba21f51e,
        .word   0x3e778a4c, 0x8178a4c8, 0x3ff36845, 0x766eec92,
        .word   0x3e7745d1, 0x745d1746, 0x3ff34bf6, 0x3d156826,
        .word   0x3e7702e0, 0x5c0b8170, 0x3ff33021, 0x8127c0e0,
        .word   0x3e76c16c, 0x16c16c17, 0x3ff314c3, 0xd92a9e91,
        .word   0x3e768168, 0x16816817, 0x3ff2f9d9, 0xfd52fd50,
        .word   0x3e7642c8, 0x590b2164, 0x3ff2df60, 0xc5df2c9e,
        .word   0x3e760581, 0x60581606, 0x3ff2c555, 0x2988e428,
        .word   0x3e75c988, 0x2b931057, 0x3ff2abb4, 0x3c0eb0f4,
        .word   0x3e758ed2, 0x308158ed, 0x3ff2927b, 0x2cd320f5,
        .word   0x3e755555, 0x55555555, 0x3ff279a7, 0x4590331c,
        .word   0x3e751d07, 0xeae2f815, 0x3ff26135, 0xe91daf55,
        .word   0x3e74e5e0, 0xa72f0539, 0x3ff24924, 0x92492492,
        .word   0x3e74afd6, 0xa052bf5b, 0x3ff23170, 0xd2be638a,
        .word   0x3e747ae1, 0x47ae147b, 0x3ff21a18, 0x51ff630a,
        .word   0x3e7446f8, 0x6562d9fb, 0x3ff20318, 0xcc6a8f5d,
        .word   0x3e741414, 0x14141414, 0x3ff1ec70, 0x124e98f9,
        .word   0x3e73e22c, 0xbce4a902, 0x3ff1d61c, 0x070ae7d3,
        .word   0x3e73b13b, 0x13b13b14, 0x3ff1c01a, 0xa03be896,
        .word   0x3e738138, 0x13813814, 0x3ff1aa69, 0xe4f2777f,
        .word   0x3e73521c, 0xfb2b78c1, 0x3ff19507, 0xecf5b9e9,
        .word   0x3e7323e3, 0x4a2b10bf, 0x3ff17ff2, 0xe00ec3ee,
        .word   0x3e72f684, 0xbda12f68, 0x3ff16b28, 0xf55d72d4,
        .word   0x3e72c9fb, 0x4d812ca0, 0x3ff156a8, 0x72b5ef62,
        .word   0x3e729e41, 0x29e4129e, 0x3ff1426f, 0xac0654db,
        .word   0x3e727350, 0xb8812735, 0x3ff12e7d, 0x02c40253,
        .word   0x3e724924, 0x92492492, 0x3ff11ace, 0xe560242a,
        .word   0x3e721fb7, 0x8121fb78, 0x3ff10763, 0xcec30b26,
        .word   0x3e71f704, 0x7dc11f70, 0x3ff0f43a, 0x45cdedad,
        .word   0x3e71cf06, 0xada2811d, 0x3ff0e150, 0xdce2b60c,
        .word   0x3e71a7b9, 0x611a7b96, 0x3ff0cea6, 0x317186dc,
        .word   0x3e718118, 0x11811812, 0x3ff0bc38, 0xeb8ba412,
        .word   0x3e715b1e, 0x5f75270d, 0x3ff0aa07, 0xbd7b7488,
        .word   0x3e7135c8, 0x1135c811, 0x3ff09811, 0x63615499,
        .word   0x3e711111, 0x11111111, 0x3ff08654, 0xa2d4f6db,
        .word   0x3e70ecf5, 0x6be69c90, 0x3ff074d0, 0x4a8b1438,
        .word   0x3e70c971, 0x4fbcda3b, 0x3ff06383, 0x31ff307a,
        .word   0x3e70a681, 0x0a6810a7, 0x3ff0526c, 0x39213bfa,
        .word   0x3e708421, 0x08421084, 0x3ff0418a, 0x4806de7d,
        .word   0x3e70624d, 0xd2f1a9fc, 0x3ff030dc, 0x4ea03a72,
        .word   0x3e704104, 0x10410410, 0x3ff02061, 0x446ffa9a,
        .word   0x3e702040, 0x81020408, 0x3ff01018, 0x28467ee9,
! rows i = 64 .. 127 (same first double as rows 0 .. 63, second double
! continues from 1.0 downwards)
        .word   0x3e800000, 0x00000000, 0x3ff00000, 0x00000000,
        .word   0x3e7f81f8, 0x1f81f820, 0x3fefc0bd, 0x88a0f1d9,
        .word   0x3e7f07c1, 0xf07c1f08, 0x3fef82ec, 0x882c0f9b,
        .word   0x3e7e9131, 0xabf0b767, 0x3fef467f, 0x2814b0cc,
        .word   0x3e7e1e1e, 0x1e1e1e1e, 0x3fef0b68, 0x48d2af1c,
        .word   0x3e7dae60, 0x76b981db, 0x3feed19b, 0x75e78957,
        .word   0x3e7d41d4, 0x1d41d41d, 0x3fee990c, 0xdad55ed2,
        .word   0x3e7cd856, 0x89039b0b, 0x3fee61b1, 0x38f18adc,
        .word   0x3e7c71c7, 0x1c71c71c, 0x3fee2b7d, 0xddfefa66,
        .word   0x3e7c0e07, 0x0381c0e0, 0x3fedf668, 0x9b7e6350,
        .word   0x3e7bacf9, 0x14c1bad0, 0x3fedc267, 0xbea45549,
        .word   0x3e7b4e81, 0xb4e81b4f, 0x3fed8f72, 0x08e6b82d,
        .word   0x3e7af286, 0xbca1af28, 0x3fed5d7e, 0xa914b937,
        .word   0x3e7a98ef, 0x606a63be, 0x3fed2c85, 0x34ed6d86,
        .word   0x3e7a41a4, 0x1a41a41a, 0x3fecfc7d, 0xa32a9213,
        .word   0x3e79ec8e, 0x951033d9, 0x3feccd60, 0x45f5d358,
        .word   0x3e799999, 0x9999999a, 0x3fec9f25, 0xc5bfedd9,
        .word   0x3e7948b0, 0xfcd6e9e0, 0x3fec71c7, 0x1c71c71c,
        .word   0x3e78f9c1, 0x8f9c18fa, 0x3fec453d, 0x90f057a2,
        .word   0x3e78acb9, 0x0f6bf3aa, 0x3fec1982, 0xb2ece47b,
        .word   0x3e786186, 0x18618618, 0x3febee90, 0x56fb9c39,
        .word   0x3e781818, 0x18181818, 0x3febc460, 0x92eb3118,
        .word   0x3e77d05f, 0x417d05f4, 0x3feb9aed, 0xba588347,
        .word   0x3e778a4c, 0x8178a4c8, 0x3feb7232, 0x5b79db11,
        .word   0x3e7745d1, 0x745d1746, 0x3feb4a29, 0x3c1d9550,
        .word   0x3e7702e0, 0x5c0b8170, 0x3feb22cd, 0x56d87d7e,
        .word   0x3e76c16c, 0x16c16c17, 0x3feafc19, 0xd8606169,
        .word   0x3e768168, 0x16816817, 0x3fead60a, 0x1d0fb394,
        .word   0x3e7642c8, 0x590b2164, 0x3feab099, 0xae8f539a,
        .word   0x3e760581, 0x60581606, 0x3fea8bc4, 0x41a3d02c,
        .word   0x3e75c988, 0x2b931057, 0x3fea6785, 0xb41bacf7,
        .word   0x3e758ed2, 0x308158ed, 0x3fea43da, 0x0adc6899,
        .word   0x3e755555, 0x55555555, 0x3fea20bd, 0x700c2c3e,
        .word   0x3e751d07, 0xeae2f815, 0x3fe9fe2c, 0x315637ee,
        .word   0x3e74e5e0, 0xa72f0539, 0x3fe9dc22, 0xbe484458,
        .word   0x3e74afd6, 0xa052bf5b, 0x3fe9ba9d, 0xa6c73588,
        .word   0x3e747ae1, 0x47ae147b, 0x3fe99999, 0x9999999a,
        .word   0x3e7446f8, 0x6562d9fb, 0x3fe97913, 0x63068b54,
        .word   0x3e741414, 0x14141414, 0x3fe95907, 0xeb87ab44,
        .word   0x3e73e22c, 0xbce4a902, 0x3fe93974, 0x368cfa31,
        .word   0x3e73b13b, 0x13b13b14, 0x3fe91a55, 0x6151761c,
        .word   0x3e738138, 0x13813814, 0x3fe8fba8, 0xa1bf6f96,
        .word   0x3e73521c, 0xfb2b78c1, 0x3fe8dd6b, 0x4563a009,
        .word   0x3e7323e3, 0x4a2b10bf, 0x3fe8bf9a, 0xb06e1af3,
        .word   0x3e72f684, 0xbda12f68, 0x3fe8a234, 0x5cc04426,
        .word   0x3e72c9fb, 0x4d812ca0, 0x3fe88535, 0xd90703c6,
        .word   0x3e729e41, 0x29e4129e, 0x3fe8689c, 0xc7e07e7d,
        .word   0x3e727350, 0xb8812735, 0x3fe84c66, 0xdf0ca4c2,
        .word   0x3e724924, 0x92492492, 0x3fe83091, 0xe6a7f7e7,
        .word   0x3e721fb7, 0x8121fb78, 0x3fe8151b, 0xb86fee1d,
        .word   0x3e71f704, 0x7dc11f70, 0x3fe7fa02, 0x3f1068d1,
        .word   0x3e71cf06, 0xada2811d, 0x3fe7df43, 0x7579b9b5,
        .word   0x3e71a7b9, 0x611a7b96, 0x3fe7c4dd, 0x663ebb88,
        .word   0x3e718118, 0x11811812, 0x3fe7aace, 0x2afa8b72,
        .word   0x3e715b1e, 0x5f75270d, 0x3fe79113, 0xebbd7729,
        .word   0x3e7135c8, 0x1135c811, 0x3fe777ac, 0xde80baea,
        .word   0x3e711111, 0x11111111, 0x3fe75e97, 0x46a0b098,
        .word   0x3e70ecf5, 0x6be69c90, 0x3fe745d1, 0x745d1746,
        .word   0x3e70c971, 0x4fbcda3b, 0x3fe72d59, 0xc45f1fc5,
        .word   0x3e70a681, 0x0a6810a7, 0x3fe7152e, 0x9f44f01f,
        .word   0x3e708421, 0x08421084, 0x3fe6fd4e, 0x79325467,
        .word   0x3e70624d, 0xd2f1a9fc, 0x3fe6e5b7, 0xd16657e1,
        .word   0x3e704104, 0x10410410, 0x3fe6ce69, 0x31d5858d,
        .word   0x3e702040, 0x81020408, 0x3fe6b761, 0x2ec892f6,

! Constants stored directly after the 2048-byte lookup table.  __vrsqrtf
! loads them once at entry with ldd, at the offsets noted on each line.
! K0..K3 are the coefficients of the cubic evaluated per element:
!   ((K3 * xx + K2) * xx + K1) * xx + K0
        .word   0x3fefffff, 0xfee7f18f  ! K0 =  9.99999997962321453275e-01   (.CONST_TBL+2048)
        .word   0xbfdfffff, 0xfe07e52f  ! K1 = -4.99999998166077580600e-01   (.CONST_TBL+2048+8)
        .word   0x3fd80118, 0x0ca296d9  ! K2 =  3.75066768969515586277e-01   (.CONST_TBL+2048+16)
        .word   0xbfd400fc, 0x0bbb8e78  ! K3 = -3.12560092408808548438e-01   (.CONST_TBL+2048+24)
        .word   0x7ffe0000, 0x7ffe0000  ! DC0: 0x7ffe0000 mask for each 32-bit half, applied with fand   (.CONST_TBL+2048+32)
        .word   0x3f800000, 0x40000000  ! FONE = 1.0f (first word), FTWO = 2.0f (second word); one ldd fills both   (.CONST_TBL+2048+40)

! Integer register aliases used by __vrsqrtf.
!   stridex  - incoming %i2 shifted left by 2 (stride scaled by 4)
!   stridex2 - stridex shifted left by 1.  %l1 is also used as a scratch
!              register while the packed exponent word is built, so the
!              value is recomputed (sll stridex,1,stridex2) before it is
!              needed again.
!   stridey  - incoming %i4 shifted left by 2
!   stridey2 - stridey + stridey.  %i2 first carries an incoming argument
!              and then a copy of %i3 (the output pointer); it only takes
!              this role once that copy has been moved to %l7.
!   TBL      - address of .CONST_TBL
!   counter  - element count, reloaded from the tmp_counter stack slot
#define stridex         %l4
#define stridex2        %l1
#define stridey         %l3
#define stridey2        %i2
#define TBL             %l2
#define counter         %i5

! Floating point register aliases.  K0..K3 and DC0 are doubles; FONE and
! FTWO are the two single precision halves of one double register pair,
! filled by a single ldd into FONE.
#define K3              %f38
#define K2              %f36
#define K1              %f34
#define K0              %f32
#define DC0             %f4
#define FONE            %f2
#define FTWO            %f3

! Registers preloaded (sethi) with the two bit patterns the raw float
! arguments are compared against: 0x00800000 and 0x7f800000.
#define _0x00800000     %o2
#define _0x7f800000     %o4

! Offsets of the 8-byte scratch slots, used as [%fp+name].  tmp0, tmp1 and
! tmp2 each hold a packed exponent word (stored with stx, read back with
! ldd).  tmp_counter holds the 32-bit element count and tmp_px the 64-bit
! input pointer.
#define tmp0            STACK_BIAS-0x30
#define tmp1            STACK_BIAS-0x28
#define tmp2            STACK_BIAS-0x20
#define tmp3            STACK_BIAS-0x18
#define tmp_counter     STACK_BIAS-0x10
#define tmp_px          STACK_BIAS-0x08

! sizeof temp storage - must be a multiple of 16 for V9
! (0x30 bytes = the six 8-byte slots above; added to the frame size in
! the save instruction)
#define tmps            0x30

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!      !!!!!   algorithm   !!!!!
!  ((float*)&ddx0)[0] = *px;
!  ax0 = *(int*)px;
!
!  ((float*)&ddx0)[1] = *(px + stridex);
!  ax1 = *(int*)(px + stridex);
!
!  px += stridex2;
!
!  if ( ax0 >= 0x7f800000 )
!  {
!    RETURN ( FONE / ((float*)&dres0)[0] );
!  }
!  if ( ax0 < 0x00800000 )
!  {
!    float res = ((float*)&dres0)[0];
!
!    if ( (ax0 & 0x7fffffff) == 0 )  /* |X| = zero  */
!    {
!      RETURN ( FONE / res )
!    }
!    else if ( ax0 >= 0 )  /* X = denormal  */
!    {
!      double    res0, xx0, tbl_div0, tbl_sqrt0;
!      float    fres0;
!      int    iax0, si0, iexp0;
!
!      res = *(int*)&res;
!      res *= FTWO;
!      ax0 = *(int*)&res;
!      iexp0 = ax0 >> 24;
!      iexp0 = 0x3f + 0x4b - iexp0;
!      iexp0 = iexp0 << 23;
!
!      si0 = (ax0 >> 13) & 0x7f0;
!
!      tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
!      tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
!      iax0 = ax0 & 0x7ffe0000;
!      iax0 = ax0 - iax0;
!      xx0 = iax0 * tbl_div0;
!      res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
!
!      fres0 = res0;
!      iexp0 += *(int*)&fres0;
!      RETURN(*(float*)&iexp0)
!    }
!    else  /* X = negative  */
!    {
!      RETURN ( sqrtf(res) )
!    }
!  }
!  if ( ax1 >= 0x7f800000 )
!  {
!    RETURN ( FONE / ((float*)&dres0)[1] )
!  }
!  if ( ax1 < 0x00800000 )
!  {
!    float res = ((float*)&dres0)[1];
!    if ( (ax1 & 0x7fffffff) == 0 )  /* |X| = zero  */
!    {
!      RETURN ( FONE / res )
!    }
!    else if ( ax1 >= 0 )  /* X = denormal  */
!    {
!      double    res0, xx0, tbl_div0, tbl_sqrt0;
!      float    fres0;
!      int    iax1, si0, iexp0;
!
!      res = *(int*)&res;
!      res *= FTWO;
!      ax1 = *(int*)&res;
!      iexp0 = ax1 >> 24;
!      iexp0 = 0x3f + 0x4b - iexp0;
!      iexp0 = iexp0 << 23;
!
!      si0 = (ax1 >> 13) & 0x7f0;
!
!      tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
!      tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
!      iax1 = ax1 & 0x7ffe0000;
!      iax1 = ax1 - iax1;
!      xx0 = iax1 * tbl_div0;
!      res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
!
!      fres0 = res0;
!      iexp0 += *(int*)&fres0;
!      RETURN(*(float*)&iexp0)
!    }
!    else  /* X = negative  */
!    {
!      RETURN ( sqrtf(res) )
!    }
!  }
!
!  iexp0 = ax0 >> 24;
!  iexp1 = ax1 >> 24;
!  iexp0 = 0x3f - iexp0;
!  iexp1 = 0x3f - iexp1;
!  iexp1 &= 0x1ff;
!  lexp0 = iexp0 << 55;
!  lexp1 = iexp1 << 23;
!
!  lexp0 |= lexp1;
!
!  fdx0 = *((double*)&lexp0);
!
!  si0 = ax0 >> 13;
!  si1 = ax1 >> 13;
!  si0 &= 0x7f0;
!  si1 &= 0x7f0;
!
!  addr0 = (char*)TBL + si0;
!  addr1 = (char*)TBL + si1;
!  tbl_div0 = ((double*)((char*)TBL + si0))[0];
!  tbl_div1 = ((double*)((char*)TBL + si1))[0];
!  tbl_sqrt0 = ((double*)addr0)[1];
!  tbl_sqrt1 = ((double*)addr1)[1];
!  dfx0 = vis_fand(ddx0,DC0);
!  dfx0 = vis_fpsub32(ddx0,dfx0);
!  dtmp0 = (double)(((int*)&dfx0)[0]);
!  dtmp1 = (double)(((int*)&dfx0)[1]);
!  xx0 = dtmp0 * tbl_div0;
!  xx1 = dtmp1 * tbl_div1;
!  res0 = K3 * xx0;
!  res1 = K3 * xx1;
!  res0 += K2;
!  res1 += K2;
!  res0 *= xx0;
!  res1 *= xx1;
!  res0 += K1;
!  res1 += K1;
!  res0 *= xx0;
!  res1 *= xx1;
!  res0 += K0;
!  res1 += K0;
!  res0 = tbl_sqrt0 * res0;
!  res1 = tbl_sqrt1 * res1;
!  ((float*)&dres0)[0] = (float)res0;
!  ((float*)&dres0)[1] = (float)res1;
!  dres0 = vis_fpadd32(dres0,fdx0);
!  *py = ((float*)&dres0)[0];
!  *(py + stridey) = ((float*)&dres0)[1];
!  py += stridey2;
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

        ENTRY(__vrsqrtf)
        save    %sp,-SA(MINFRAME)-tmps,%sp
        PIC_SETUP(l7)
        PIC_SET(l7,.CONST_TBL,l2)

        st      %i0,[%fp+tmp_counter]
        stx     %i1,[%fp+tmp_px]

        ldd     [TBL+2048],K0
        sll     %i2,2,stridex

        ldd     [TBL+2048+8],K1
        sll     %i4,2,stridey
        mov     %i3,%i2

        ldd     [TBL+2048+16],K2
        sethi   %hi(0x7f800000),_0x7f800000
        sll     stridex,1,stridex2

        ldd     [TBL+2048+24],K3
        sethi   %hi(0x00800000),_0x00800000

        ldd     [TBL+2048+32],DC0
        add     %g0,0x3f,%l0

        ldd     [TBL+2048+40],FONE
!       ld      [TBL+2048+44],FTWO
.begin:
        ld      [%fp+tmp_counter],counter
        ldx     [%fp+tmp_px],%l7
        st      %g0,[%fp+tmp_counter]
.begin1:
        cmp     counter,0
        ble,pn  %icc,.exit

        lda     [%l7]0x82,%f14          ! (4_0) ((float*)&ddx0)[0] = *px;

        lda     [stridex+%l7]0x82,%f15  ! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
        sethi   %hi(0x7ffffc00),%o0

        lda     [%l7]0x82,%g1           ! (4_0) ax0 = *(int*)px;
        add     %l7,stridex2,%i1        ! px += stridex2
        add     %o0,0x3ff,%o0

        lda     [stridex+%l7]0x82,%g5   ! (5_0) ax1 = *(int*)(px + stridex);
        fand    %f14,DC0,%f16           ! (4_0) dfx0 = vis_fand(ddx0,DC0);

        sra     %g1,13,%l5              ! (4_0) si0 = ax0 >> 13;
        add     %i1,stridex2,%o5        ! px += stridex2

        cmp     %g1,_0x7f800000         ! (4_1) ax0 ? 0x7f800000
        bge,pn  %icc,.spec0             ! (4_1) if ( ax0 >= 0x7f800000 )
        nop

        cmp     %g1,_0x00800000         ! (4_1) ax0 ? 0x00800000
        bl,pn   %icc,.spec1             ! (4_1) if ( ax0 < 0x00800000 )
        sra     %g5,13,%l6              ! (5_0) si1 = ax1 >> 13;
.cont_spec:
        and     %l5,2032,%l5            ! (4_0) si0 &= 0x7f0;

        ldd     [%l5+TBL],%f54          ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        sra     %g5,24,%l7              ! (5_0) iexp1 = ax1 >> 24;
        and     %l6,2032,%l6            ! (5_0) si1 &= 0x7f0;
        fpsub32 %f14,%f16,%f16          ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%l6+TBL],%f46          ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sra     %g1,24,%i3              ! (4_0) iexp0 = ax0 >> 24;
        sub     %l0,%l7,%l7             ! (5_0) iexp1 = 0x3f - iexp1;

        and     %l7,511,%l1             ! (5_0) iexp1 = 0x1ff;
        add     %l6,TBL,%l6             ! (5_0) addr1 = (char*)TBL + si1;

        sllx    %l1,23,%l1              ! (5_0) lexp1 = iexp1 << 23;
        sub     %l0,%i3,%o0             ! (4_0) iexp0 = 0x3f - iexp0;
        fitod   %f16,%f56               ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);

        sllx    %o0,55,%o0              ! (4_0) lexp0 = iexp0 << 55;
        fitod   %f17,%f44               ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);

        or      %o0,%l1,%o0             ! (4_0) lexp0 |= lexp1;

        stx     %o0,[%fp+tmp0]          ! (4_0) fdx0 = *((double*)lexp0);

        fmuld   %f56,%f54,%f40          ! (4_0) xx0 = dtmp0 * tbl_div0;

        lda     [%i1]0x82,%f18          ! (0_0) ((float*)&ddx0)[0] = *px;
        fmuld   %f44,%f46,%f46          ! (5_1) xx1 = dtmp1 * tbl_div1;

        lda     [stridex+%i1]0x82,%f19  ! (1_0) ((float*)&ddx0)[1] = *(px + stridex);

        lda     [%i1]0x82,%g1           ! (0_0) ax0 = *(int*)px;

        lda     [stridex+%i1]0x82,%i4   ! (1_0) ax1 = *(int*)(px + stridex);
        cmp     %g5,_0x7f800000         ! (5_1) ax1 ? 0x7f800000
        bge,pn  %icc,.update0           ! (5_1) if ( ax1 >= 0x7f800000 )
        fmuld   K3,%f40,%f52            ! (4_1) res0 = K3 * xx0;
.cont0:
        fmuld   K3,%f46,%f50            ! (5_1) res1 = K3 * xx1;
        cmp     %g5,_0x00800000         ! (5_1) ax1 ? 0x00800000
        bl,pn   %icc,.update1           ! (5_1) if ( ax1 < 0x00800000 )
        fand    %f18,DC0,%f56           ! (0_0) dfx0 = vis_fand(ddx0,DC0);
.cont1:
        sra     %g1,13,%o0              ! (0_0) si0 = ax0 >> 13;
        cmp     %g1,_0x7f800000         ! (0_0) ax0 ? 0x7f800000

        sra     %i4,13,%g5              ! (1_0) si1 = ax1 >> 13;
        and     %o0,2032,%o0            ! (0_0) si0 &= 0x7f0;

        ldd     [%o0+TBL],%f54          ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        sra     %i4,24,%i1              ! (1_0) iexp1 = ax1 >> 24;
        and     %g5,2032,%o7            ! (1_0) si1 &= 0x7f0;
        fpsub32 %f18,%f56,%f30          ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%o7+TBL],%f44          ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sra     %g1,24,%i3              ! (0_0) iexp0 = ax0 >> 24;
        sub     %l0,%i1,%i1             ! (1_0) iexp1 = 0x3f - iexp1;
        faddd   %f52,K2,%f62            ! (4_1) res0 += K2;

        sub     %l0,%i3,%g5             ! (0_0) iexp0 = 0x3f - iexp0;
        bge,pn  %icc,.update2           ! (0_0) if ( ax0 >= 0x7f800000 )
        faddd   %f50,K2,%f60            ! (5_1) res1 += K2;
.cont2:
        cmp     %g1,_0x00800000         ! (0_0) ax0 ? 0x00800000
        and     %i1,511,%i0             ! (1_0) iexp1 = 0x1ff;
        fitod   %f30,%f56               ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);

        sllx    %i0,23,%i0              ! (1_0) lexp1 = iexp1 << 23;
        bl,pn   %icc,.update3           ! (0_0) if ( ax0 < 0x00800000 )
        fitod   %f31,%f50               ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);
.cont3:
        fmuld   %f62,%f40,%f30          ! (4_1) res0 *= xx0;
        sllx    %g5,55,%g5              ! (0_0) lexp0 = iexp0 << 55;

        fmuld   %f60,%f46,%f48          ! (5_1) res1 *= xx1;
        or      %g5,%i0,%g5             ! (0_0) lexp0 |= lexp1;
        stx     %g5,[%fp+tmp1]          ! (0_0) fdx0 = *((double*)lexp0);

        fmuld   %f56,%f54,%f26          ! (0_0) xx0 = dtmp0 * tbl_div0;
        sll     stridex,1,stridex2      ! stridex2 = stridex * 2;

        lda     [%o5]0x82,%f24          ! (2_0) ((float*)&ddx0)[0] = *px;
        add     %o7,TBL,%o7             ! (1_0) addr0 = (char*)TBL + si0;
        fmuld   %f50,%f44,%f44          ! (1_0) xx0 = dtmp0 * tbl_div0;

        lda     [stridex+%o5]0x82,%f25  ! (3_0) ((float*)&ddx0)[1] = *(px + stridex);
        add     %l5,TBL,%l5             ! (4_1) addr0 = (char*)TBL + si0;
        faddd   %f30,K1,%f62            ! (4_1) res0 += K1;

        lda     [%o5]0x82,%g1           ! (2_0) ax0 = *(int*)px;
        add     %o5,stridex2,%l7        ! px += stridex2
        faddd   %f48,K1,%f42            ! (5_1) res1 += K1;

        lda     [stridex+%o5]0x82,%o5   ! (3_0) ax1 = *(int*)(px + stridex);
        cmp     %i4,_0x7f800000         ! (1_0) ax1 ? 0x7f800000
        bge,pn  %icc,.update4           ! (1_0) if ( ax1 >= 0x7f800000 )
        fmuld   K3,%f26,%f52            ! (0_0) res0 = K3 * xx0;
.cont4:
        fmuld   K3,%f44,%f50            ! (1_0) res1 = K3 * xx1;
        cmp     %i4,_0x00800000         ! (1_0) ax1 ? 0x00800000
        bl,pn   %icc,.update5           ! (1_0) if ( ax1 < 0x00800000 )
        fand    %f24,DC0,%f54           ! (2_0) dfx0 = vis_fand(ddx0,DC0);
.cont5:
        fmuld   %f62,%f40,%f48          ! (4_1) res0 *= xx0;
        sra     %g1,13,%i0              ! (2_0) si0 = ax0 >> 13;
        cmp     %g1,_0x7f800000         ! (2_0) ax0 ? 0x7f800000

        fmuld   %f42,%f46,%f58          ! (5_1) res1 *= xx1;
        sra     %o5,13,%o1              ! (3_0) si1 = ax1 >> 13;
        and     %i0,2032,%i0            ! (2_0) si0 &= 0x7f0;

        ldd     [%i0+TBL],%f30          ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        sra     %o5,24,%o3              ! (3_0) iexp1 = ax1 >> 24;
        and     %o1,2032,%o1            ! (3_0) si1 &= 0x7f0;
        fpsub32 %f24,%f54,%f12          ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%o1+TBL],%f46          ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sra     %g1,24,%i3              ! (2_0) iexp0 = ax0 >> 24;
        sub     %l0,%o3,%o3             ! (3_0) iexp1 = 0x3f - iexp1;
        faddd   %f52,K2,%f40            ! (0_0) res0 += K2;

        ldd     [%l5+8],%f42            ! (4_1) tbl_sqrt0 = ((double*)addr0)[1];
        sub     %l0,%i3,%g5             ! (2_0) iexp0 = 0x3f - iexp0;
        and     %o3,511,%i3             ! (3_0) iexp1 &= 0x1ff;
        faddd   %f50,K2,%f60            ! (1_0) res0 += K2;

        ldd     [%l6+8],%f28            ! (5_1) tbl_sqrt1 = ((double*)addr1)[1];
        sllx    %g5,55,%g5              ! (2_0) lexp0 = iexp0 << 55;
        add     %i0,TBL,%i0             ! (2_0) addr0 = (char*)TBL + si0;
        fitod   %f12,%f56               ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);

        sllx    %i3,23,%i3              ! (3_0) lexp1 = iexp1 << 23;
        fitod   %f13,%f50               ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);

        fmuld   %f40,%f26,%f40          ! (0_0) res0 *= xx0;
        or      %g5,%i3,%g5             ! (2_0) lexp0 |= lexp1;
        faddd   %f48,K0,%f62            ! (4_1) res0 += K0;

        fmuld   %f60,%f44,%f48          ! (1_0) res1 *= xx1;
        add     %o1,TBL,%o1             ! (3_0) addr1 = (char*)TBL + si1;
        stx     %g5,[%fp+tmp2]          ! (2_0) fdx0 = *((double*)lexp0);
        faddd   %f58,K0,%f60            ! (5_1) res1 += K0;

        fmuld   %f56,%f30,%f30          ! (2_0) xx0 = dtmp0 * tbl_div0;
        bge,pn  %icc,.update6           ! (2_0) if ( ax0 >= 0x7f800000 )
        lda     [%l7]0x82,%f14          ! (4_0) ((float*)&ddx0)[0] = *px;
.cont6:
        cmp     %g1,_0x00800000         ! (2_0) ax0 ? 0x00800000
        bl,pn   %icc,.update7           ! (2_0) if ( ax0 < 0x00800000 )
        nop
.cont7:
        fmuld   %f50,%f46,%f24          ! (3_0) xx1 = dtmp1 * tbl_div1;

        lda     [stridex+%l7]0x82,%f15  ! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
        cmp     %o5,_0x7f800000         ! (3_0) ax1 ? 0x7f800000
        fmuld   %f42,%f62,%f58          ! (4_1) res0 = tbl_sqrt0 * res0;
        faddd   %f40,K1,%f46            ! (0_0) res0 += K1;

        lda     [%l7]0x82,%g1           ! (4_0) ax0 = *(int*)px;
        add     %l7,stridex2,%i1        ! px += stridex2
        fmuld   %f28,%f60,%f56          ! (5_1) res1 = tbl_sqrt1 * res1;
        faddd   %f48,K1,%f62            ! (1_0) res1 += K1;

        lda     [stridex+%l7]0x82,%g5   ! (5_0) ax1 = *(int*)(px + stridex);
        add     %o0,TBL,%o0             ! (0_0) addr0 = (char*)TBL + si0;
        bge,pn  %icc,.update8           ! (3_0) if ( ax1 >= 0x7f800000 )
        fmuld   K3,%f30,%f52            ! (2_0) res0 = K3 * xx0;
.cont8:
        fmuld   K3,%f24,%f50            ! (3_0) res1 = K3 * xx1;
        cmp     %o5,_0x00800000         ! (3_0) ax1 ? 0x00800000
        bl,pn   %icc,.update9           ! (3_0) if ( ax1 < 0x00800000 )
        fand    %f14,DC0,%f16           ! (4_0) dfx0 = vis_fand(ddx0,DC0);
.cont9:
        fmuld   %f46,%f26,%f48          ! (0_0) res0 *= xx0;
        sra     %g1,13,%l5              ! (4_0) si0 = ax0 >> 13;
        add     %i1,stridex2,%o5        ! px += stridex2
        fdtos   %f58,%f6                ! (4_1) ((float*)&dres0)[0] = (float)res0;

        fmuld   %f62,%f44,%f40          ! (1_0) res1 *= xx1;
        sra     %g5,13,%l6              ! (5_0) si1 = ax1 >> 13;
        and     %l5,2032,%l5            ! (4_0) si0 &= 0x7f0;
        fdtos   %f56,%f7                ! (5_1) ((float*)&dres0)[1] = (float)res1;

        ldd     [%l5+TBL],%f54          ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        sra     %g5,24,%l7              ! (5_0) iexp1 = ax1 >> 24;
        and     %l6,2032,%l6            ! (5_0) si1 &= 0x7f0;
        fpsub32 %f14,%f16,%f16          ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%l6+TBL],%f46          ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sra     %g1,24,%i3              ! (4_0) iexp0 = ax0 >> 24;
        sub     %l0,%l7,%l7             ! (5_0) iexp1 = 0x3f - iexp1;
        faddd   %f52,K2,%f58            ! (2_0) res0 += K2;

        ldd     [%o0+8],%f42            ! (0_0) tbl_sqrt0 = ((double*)addr0)[1];
        and     %l7,511,%l1             ! (5_0) iexp1 = 0x1ff;
        add     %l6,TBL,%l6             ! (5_0) addr1 = (char*)TBL + si1;
        faddd   %f50,K2,%f60            ! (3_0) res1 += K2;

        ldd     [%o7+8],%f28            ! (1_0) tbl_sqrt1 = ((double*)addr1)[1];
        sllx    %l1,23,%l1              ! (5_0) lexp1 = iexp1 << 23;
        sub     %l0,%i3,%o0             ! (4_0) iexp0 = 0x3f - iexp0;
        fitod   %f16,%f56               ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);

        ldd     [%fp+tmp0],%f52         ! (4_1) fdx0 = *((double*)lexp0);
        sllx    %o0,55,%o0              ! (4_0) lexp0 = iexp0 << 55;
        fitod   %f17,%f44               ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);

        fmuld   %f58,%f30,%f62          ! (2_0) res0 *= xx0;
        or      %o0,%l1,%o0             ! (4_0) lexp0 |= lexp1;
        faddd   %f48,K0,%f22            ! (0_0) res0 += K0;

        fmuld   %f60,%f24,%f58          ! (3_0) res1 *= xx1;
        stx     %o0,[%fp+tmp0]          ! (4_0) fdx0 = *((double*)lexp0);
        faddd   %f40,K0,%f26            ! (1_0) res1 += K0;

        fmuld   %f56,%f54,%f40          ! (4_0) xx0 = dtmp0 * tbl_div0;
        fpadd32 %f6,%f52,%f10           ! (4_1) dres0 = vis_fpadd32(dres0,fdx0);

        or      %g0,%i2,%l7
        add     stridey,stridey,stridey2

        cmp     counter,6
        bl,pn   %icc,.tail
        nop

        ba      .main_loop
        sub     counter,6,counter       ! counter

        .align  16
.main_loop:
        lda     [%i1]0x82,%f18          ! (0_0) ((float*)&ddx0)[0] = *px;
        cmp     %g1,_0x7f800000         ! (4_1) ax0 ? 0x7f800000
        bge,pn  %icc,.update10          ! (4_1) if ( ax0 >= 0x7f800000 )
        fmuld   %f44,%f46,%f46          ! (5_1) xx1 = dtmp1 * tbl_div1;
.cont10:
        lda     [stridex+%i1]0x82,%f19  ! (1_0) ((float*)&ddx0)[1] = *(px + stridex);
        cmp     %g1,_0x00800000         ! (4_1) ax0 ? 0x00800000
        fmuld   %f42,%f22,%f44          ! (0_1) res0 = tbl_sqrt0 * res0;
        faddd   %f62,K1,%f42            ! (2_1) res0 += K1;

        lda     [%i1]0x82,%g1           ! (0_0) ax0 = *(int*)px;
        fmuld   %f28,%f26,%f60          ! (1_1) res1 = tbl_sqrt1 * res1;
        bl,pn   %icc,.update11          ! (4_1) if ( ax0 < 0x00800000 )
        faddd   %f58,K1,%f62            ! (3_1) res1 += K1;
.cont11:
        lda     [stridex+%i1]0x82,%i4   ! (1_0) ax1 = *(int*)(px + stridex);
        cmp     %g5,_0x7f800000         ! (5_1) ax1 ? 0x7f800000
        bge,pn  %icc,.update12          ! (5_1) if ( ax1 >= 0x7f800000 )
        fmuld   K3,%f40,%f52            ! (4_1) res0 = K3 * xx0;
.cont12:
        fmuld   K3,%f46,%f50            ! (5_1) res1 = K3 * xx1;
        cmp     %g5,_0x00800000         ! (5_1) ax1 ? 0x00800000
        bl,pn   %icc,.update13          ! (5_1) if ( ax1 < 0x00800000 )
        fand    %f18,DC0,%f56           ! (0_0) dfx0 = vis_fand(ddx0,DC0);
.cont13:
        fmuld   %f42,%f30,%f48          ! (2_1) res0 *= xx0;
        sra     %g1,13,%o0              ! (0_0) si0 = ax0 >> 13;
        cmp     %g1,_0x7f800000         ! (0_0) ax0 ? 0x7f800000
        fdtos   %f44,%f8                ! (0_1) ((float*)&dres0)[0] = (float)res0;

        fmuld   %f62,%f24,%f58          ! (3_1) res1 *= xx1;
        sra     %i4,13,%g5              ! (1_0) si1 = ax1 >> 13;
        and     %o0,2032,%o0            ! (0_0) si0 &= 0x7f0;
        fdtos   %f60,%f9                ! (1_1) ((float*)&dres0)[1] = (float)res1;

        ldd     [%o0+TBL],%f54          ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        sra     %i4,24,%i1              ! (1_0) iexp1 = ax1 >> 24;
        and     %g5,2032,%o7            ! (1_0) si1 &= 0x7f0;
        fpsub32 %f18,%f56,%f30          ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%o7+TBL],%f44          ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sra     %g1,24,%i3              ! (0_0) iexp0 = ax0 >> 24;
        sub     %l0,%i1,%i1             ! (1_0) iexp1 = 0x3f - iexp1;
        faddd   %f52,K2,%f62            ! (4_1) res0 += K2;

        ldd     [%i0+8],%f42            ! (2_1) tbl_sqrt0 = ((double*)addr0)[1];
        sub     %l0,%i3,%g5             ! (0_0) iexp0 = 0x3f - iexp0;
        bge,pn  %icc,.update14          ! (0_0) if ( ax0 >= 0x7f800000 )
        faddd   %f50,K2,%f60            ! (5_1) res1 += K2;
.cont14:
        ldd     [%o1+8],%f28            ! (3_1) tbl_sqrt1 = ((double*)addr0)[1];
        cmp     %g1,_0x00800000         ! (0_0) ax0 ? 0x00800000
        and     %i1,511,%i0             ! (1_0) iexp1 = 0x1ff;
        fitod   %f30,%f56               ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);

        ldd     [%fp+tmp1],%f52         ! (0_1) fdx0 = *((double*)lexp0);
        sllx    %i0,23,%i0              ! (1_0) lexp1 = iexp1 << 23;
        bl,pn   %icc,.update15          ! (0_0) if ( ax0 < 0x00800000 )
        fitod   %f31,%f50               ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);
.cont15:
        fmuld   %f62,%f40,%f30          ! (4_1) res0 *= xx0;
        sllx    %g5,55,%g5              ! (0_0) lexp0 = iexp0 << 55;
        st      %f10,[%l7]              ! (4_2) *py = ((float*)&dres0)[0];
        faddd   %f48,K0,%f62            ! (2_1) res0 += K0;

        fmuld   %f60,%f46,%f48          ! (5_1) res1 *= xx1;
        or      %g5,%i0,%g5             ! (0_0) lexp0 |= lexp1;
        stx     %g5,[%fp+tmp1]          ! (0_0) fdx0 = *((double*)lexp0);
        faddd   %f58,K0,%f60            ! (3_1) res1 += K0;

        fmuld   %f56,%f54,%f26          ! (0_0) xx0 = dtmp0 * tbl_div0;
        sll     stridex,1,stridex2      ! stridex2 = stridex * 2;
        st      %f11,[stridey+%l7]      ! (5_2) *(py + stridey) = ((float*)&dres0)[1];
        fpadd32 %f8,%f52,%f10           ! (0_1) dres0 = vis_fpadd32(dres0,fdx0);

        lda     [%o5]0x82,%f24          ! (2_0) ((float*)&ddx0)[0] = *px;
        add     %l7,stridey2,%i1        ! py += stridey2
        add     %o7,TBL,%o7             ! (1_0) addr0 = (char*)TBL + si0;
        fmuld   %f50,%f44,%f44          ! (1_0) xx0 = dtmp0 * tbl_div0;

        lda     [stridex+%o5]0x82,%f25  ! (3_0) ((float*)&ddx0)[1] = *(px + stridex);
        add     %l5,TBL,%l5             ! (4_1) addr0 = (char*)TBL + si0;
        fmuld   %f42,%f62,%f58          ! (2_1) res0 = tbl_sqrt0 * res0;
        faddd   %f30,K1,%f62            ! (4_1) res0 += K1;

        lda     [%o5]0x82,%g1           ! (2_0) ax0 = *(int*)px;
        add     %o5,stridex2,%l7        ! px += stridex2
        fmuld   %f28,%f60,%f56          ! (3_1) res1 = tbl_sqrt1 * res1;
        faddd   %f48,K1,%f42            ! (5_1) res1 += K1;

        lda     [stridex+%o5]0x82,%o5   ! (3_0) ax1 = *(int*)(px + stridex);
        cmp     %i4,_0x7f800000         ! (1_0) ax1 ? 0x7f800000
        bge,pn  %icc,.update16          ! (1_0) if ( ax1 >= 0x7f800000 )
        fmuld   K3,%f26,%f52            ! (0_0) res0 = K3 * xx0;
.cont16:
        fmuld   K3,%f44,%f50            ! (1_0) res1 = K3 * xx1;
        cmp     %i4,_0x00800000         ! (1_0) ax1 ? 0x00800000
        bl,pn   %icc,.update17          ! (1_0) if ( ax1 < 0x00800000 )
        fand    %f24,DC0,%f54           ! (2_0) dfx0 = vis_fand(ddx0,DC0);
.cont17:
        fmuld   %f62,%f40,%f48          ! (4_1) res0 *= xx0;
        sra     %g1,13,%i0              ! (2_0) si0 = ax0 >> 13;
        cmp     %g1,_0x7f800000         ! (2_0) ax0 ? 0x7f800000
        fdtos   %f58,%f20               ! (2_1) ((float*)&dres0)[0] = (float)res0;

        fmuld   %f42,%f46,%f58          ! (5_1) res1 *= xx1;
        sra     %o5,13,%o1              ! (3_0) si1 = ax1 >> 13;
        and     %i0,2032,%i0            ! (2_0) si0 &= 0x7f0;
        fdtos   %f56,%f21               ! (3_1) ((float*)&dres0)[0] = (float)res0;

        ldd     [%i0+TBL],%f30          ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        sra     %o5,24,%o3              ! (3_0) iexp1 = ax1 >> 24;
        and     %o1,2032,%o1            ! (3_0) si1 &= 0x7f0;
        fpsub32 %f24,%f54,%f12          ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%o1+TBL],%f46          ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sra     %g1,24,%i3              ! (2_0) iexp0 = ax0 >> 24;
        sub     %l0,%o3,%o3             ! (3_0) iexp1 = 0x3f - iexp1;
        faddd   %f52,K2,%f40            ! (0_0) res0 += K2;

        ldd     [%l5+8],%f42            ! (4_1) tbl_sqrt0 = ((double*)addr0)[1];
        sub     %l0,%i3,%g5             ! (2_0) iexp0 = 0x3f - iexp0;
        and     %o3,511,%i3             ! (3_0) iexp1 &= 0x1ff;
        faddd   %f50,K2,%f60            ! (1_0) res0 += K2;

        ldd     [%l6+8],%f28            ! (5_1) tbl_sqrt1 = ((double*)addr1)[1];
        sllx    %g5,55,%g5              ! (2_0) lexp0 = iexp0 << 55;
        add     %i0,TBL,%i0             ! (2_0) addr0 = (char*)TBL + si0;
        fitod   %f12,%f56               ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);

        ldd     [%fp+tmp2],%f52         ! (2_1) fdx0 = *((double*)lexp0);
        sllx    %i3,23,%i3              ! (3_0) lexp1 = iexp1 << 23;
        add     %i1,stridey2,%o3        ! py += stridey2
        fitod   %f13,%f50               ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);

        fmuld   %f40,%f26,%f40          ! (0_0) res0 *= xx0;
        or      %g5,%i3,%g5             ! (2_0) lexp0 |= lexp1;
        st      %f10,[%i1]              ! (0_1) *py = ((float*)&dres0)[0];
        faddd   %f48,K0,%f62            ! (4_1) res0 += K0;

        fmuld   %f60,%f44,%f48          ! (1_0) res1 *= xx1;
        add     %o1,TBL,%o1             ! (3_0) addr1 = (char*)TBL + si1;
        stx     %g5,[%fp+tmp2]          ! (2_0) fdx0 = *((double*)lexp0);
        faddd   %f58,K0,%f60            ! (5_1) res1 += K0;

        fmuld   %f56,%f30,%f30          ! (2_0) xx0 = dtmp0 * tbl_div0;
        bge,pn  %icc,.update18          ! (2_0) if ( ax0 >= 0x7f800000 )
        st      %f11,[stridey+%i1]      ! (1_1) *(py + stridey) = ((float*)&dres0)[1];
        fpadd32 %f20,%f52,%f0           ! (2_1) dres0 = vis_fpadd32(dres0,fdx0);
.cont18:
        cmp     %g1,_0x00800000         ! (2_0) ax0 ? 0x00800000
        bl,pn   %icc,.update19          ! (2_0) if ( ax0 < 0x00800000 )
        lda     [%l7]0x82,%f14          ! (4_0) ((float*)&ddx0)[0] = *px;
        fmuld   %f50,%f46,%f24          ! (3_0) xx1 = dtmp1 * tbl_div1;
.cont19:
        lda     [stridex+%l7]0x82,%f15  ! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
        cmp     %o5,_0x7f800000         ! (3_0) ax1 ? 0x7f800000
        fmuld   %f42,%f62,%f58          ! (4_1) res0 = tbl_sqrt0 * res0;
        faddd   %f40,K1,%f46            ! (0_0) res0 += K1;

        lda     [%l7]0x82,%g1           ! (4_0) ax0 = *(int*)px;
        add     %l7,stridex2,%i1        ! px += stridex2
        fmuld   %f28,%f60,%f56          ! (5_1) res1 = tbl_sqrt1 * res1;
        faddd   %f48,K1,%f62            ! (1_0) res1 += K1;

        lda     [stridex+%l7]0x82,%g5   ! (5_0) ax1 = *(int*)(px + stridex);
        add     %o0,TBL,%o0             ! (0_0) addr0 = (char*)TBL + si0;
        bge,pn  %icc,.update20          ! (3_0) if ( ax1 >= 0x7f800000 )
        fmuld   K3,%f30,%f52            ! (2_0) res0 = K3 * xx0;
.cont20:
        fmuld   K3,%f24,%f50            ! (3_0) res1 = K3 * xx1;
        cmp     %o5,_0x00800000         ! (3_0) ax1 ? 0x00800000
        bl,pn   %icc,.update21          ! (3_0) if ( ax1 < 0x00800000 )
        fand    %f14,DC0,%f16           ! (4_0) dfx0 = vis_fand(ddx0,DC0);
.cont21:
        fmuld   %f46,%f26,%f48          ! (0_0) res0 *= xx0;
        sra     %g1,13,%l5              ! (4_0) si0 = ax0 >> 13;
        add     %i1,stridex2,%o5        ! px += stridex2
        fdtos   %f58,%f6                ! (4_1) ((float*)&dres0)[0] = (float)res0;

        fmuld   %f62,%f44,%f40          ! (1_0) res1 *= xx1;
        sra     %g5,13,%l6              ! (5_0) si1 = ax1 >> 13;
        and     %l5,2032,%l5            ! (4_0) si0 &= 0x7f0;
        fdtos   %f56,%f7                ! (5_1) ((float*)&dres0)[1] = (float)res1;

        ldd     [%l5+TBL],%f54          ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        sra     %g5,24,%l7              ! (5_0) iexp1 = ax1 >> 24;
        and     %l6,2032,%l6            ! (5_0) si1 &= 0x7f0;
        fpsub32 %f14,%f16,%f16          ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%l6+TBL],%f46          ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sra     %g1,24,%i3              ! (4_0) iexp0 = ax0 >> 24;
        sub     %l0,%l7,%l7             ! (5_0) iexp1 = 0x3f - iexp1;
        faddd   %f52,K2,%f58            ! (2_0) res0 += K2;

        ldd     [%o0+8],%f42            ! (0_0) tbl_sqrt0 = ((double*)addr0)[1];
        and     %l7,511,%l1             ! (5_0) iexp1 = 0x1ff;
        add     %l6,TBL,%l6             ! (5_0) addr1 = (char*)TBL + si1;
        faddd   %f50,K2,%f60            ! (3_0) res1 += K2;

        ldd     [%o7+8],%f28            ! (1_0) tbl_sqrt1 = ((double*)addr1)[1];
        sllx    %l1,23,%l1              ! (5_0) lexp1 = iexp1 << 23;
        sub     %l0,%i3,%o0             ! (4_0) iexp0 = 0x3f - iexp0;
        fitod   %f16,%f56               ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);

        ldd     [%fp+tmp0],%f52         ! (4_1) fdx0 = *((double*)lexp0);
        sllx    %o0,55,%o0              ! (4_0) lexp0 = iexp0 << 55;
        add     %o3,stridey2,%l7        ! py += stridey2
        fitod   %f17,%f44               ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);

        fmuld   %f58,%f30,%f62          ! (2_0) res0 *= xx0;
        or      %o0,%l1,%o0             ! (4_0) lexp0 |= lexp1;
        st      %f0,[%o3]               ! (2_1) *py = ((float*)&dres0)[0];
        faddd   %f48,K0,%f22            ! (0_0) res0 += K0;

        fmuld   %f60,%f24,%f58          ! (3_0) res1 *= xx1;
        subcc   counter,6,counter       ! counter -= 6;
        stx     %o0,[%fp+tmp0]          ! (4_0) fdx0 = *((double*)lexp0);
        faddd   %f40,K0,%f26            ! (1_0) res1 += K0;

        fmuld   %f56,%f54,%f40          ! (4_0) xx0 = dtmp0 * tbl_div0;
        st      %f1,[stridey+%o3]       ! (3_1) *(py + stridey) = ((float*)&dres0)[1];
        bpos,pt %icc,.main_loop
        fpadd32 %f6,%f52,%f10           ! (4_1) dres0 = vis_fpadd32(dres0,fdx0);

        add     counter,6,counter
! .tail: drain the software pipeline after the main loop.
!
! On entry `counter` is the number of results still to be stored (the code
! below can store at most five).  Before each store `counter` is decremented;
! as soon as it goes negative control leaves for .begin.  Every exit uses an
! annulled branch (bneg,a), so the delay-slot instruction -- which puts the
! next unwritten output address into %i2 -- runs only when the branch is
! taken.  The FP work is the remaining (0_1)/(1_1)/(2_1) stages of the main
! loop, using whatever the loop left in the FP registers.
.tail:
        sll     stridex,1,stridex2      ! stridex2 = stridex * 2
        subcc   counter,1,counter
        bneg,a  .begin                  ! nothing left to store
        mov     %l7,%i2                 ! (annulled slot) py for .begin = %l7

        fmuld   %f42,%f22,%f44          ! (0_1) res0 = tbl_sqrt0 * res0;
        faddd   %f62,K1,%f42            ! (2_1) res0 += K1;

        fmuld   %f28,%f26,%f60          ! (1_1) res1 = tbl_sqrt1 * res1;

        fmuld   %f42,%f30,%f48          ! (2_1) res0 *= xx0;
        fdtos   %f44,%f8                ! (0_1) ((float*)&dres0)[0] = (float)res0;

        fdtos   %f60,%f9                ! (1_1) ((float*)&dres0)[1] = (float)res1;

        ldd     [%i0+8],%f42            ! (2_1) tbl_sqrt0 = ((double*)addr0)[1];

        ldd     [%fp+tmp1],%f52         ! (0_1) fdx0 = *((double*)lexp0);

        st      %f10,[%l7]              ! (4_2) *py = ((float*)&dres0)[0];
        subcc   counter,1,counter
        bneg,a  .begin                  ! one result stored, done
        add     %l7,stridey,%i2         ! (annulled slot) py for .begin = %l7 + stridey

        faddd   %f48,K0,%f62            ! (2_1) res0 += K0;
        st      %f11,[stridey+%l7]      ! (5_2) *(py + stridey) = ((float*)&dres0)[1];
        subcc   counter,1,counter
        bneg,a  .begin                  ! two results stored, done
        add     %l7,stridey2,%i2        ! (annulled slot) py for .begin = %l7 + stridey2
        fpadd32 %f8,%f52,%f10           ! (0_1) dres0 = vis_fpadd32(dres0,fdx0);

        add     %l7,stridey2,%i1        ! py += stridey2

        fmuld   %f42,%f62,%f58          ! (2_1) res0 = tbl_sqrt0 * res0;

        fdtos   %f58,%f20               ! (2_1) ((float*)&dres0)[0] = (float)res0;

        ldd     [%fp+tmp2],%f52         ! (2_1) fdx0 = *((double*)lexp0);
        add     %i1,stridey2,%o3        ! py += stridey2

        st      %f10,[%i1]              ! (0_1) *py = ((float*)&dres0)[0];
        subcc   counter,1,counter
        bneg,a  .begin                  ! three results stored, done
        add     %i1,stridey,%i2         ! (annulled slot) py for .begin = %i1 + stridey

        st      %f11,[stridey+%i1]      ! (1_1) *(py + stridey) = ((float*)&dres0)[1];
        subcc   counter,1,counter
        bneg,a  .begin                  ! four results stored, done
        mov     %o3,%i2                 ! (annulled slot) py for .begin = %o3
        fpadd32 %f20,%f52,%f0           ! (2_1) dres0 = vis_fpadd32(dres0,fdx0);

        st      %f0,[%o3]               ! (2_1) *py = ((float*)&dres0)[0];
        ba      .begin                  ! five results stored: unconditional exit
        add     %o3,stridey,%i2         ! (delay slot) py for .begin = %o3 + stridey

        .align  16
! .spec0: scalar special case -- store FONE / x0 for the current element,
! advance px and py by one element, decrement counter and resume at .begin1.
! NOTE(review): the branch that reaches this label is outside this part of
! the file; presumably it is taken for Inf/NaN inputs -- confirm at the
! branch site.
.spec0:
        fdivs   FONE,%f14,%f14          ! x0 = FONE / x0;
        add     %l7,stridex,%l7         ! px += stridex
        st      %f14,[%i2]              ! *py = x0;
        sub     counter,1,counter       ! one element consumed
        ba      .begin1
        add     %i2,stridey,%i2         ! (delay slot) py += stridey

        .align  16
! .spec1: scalar special case for the current element, three-way split on
! the integer image of x0 held in %g1:
!   (%g1 & %o0) == 0 : result = FONE / x0
!   %g1 < 0          : result = sqrtf(x0)
!   otherwise        : x0 is renormalised and control rejoins the regular
!                      path at .cont_spec.
! In the first two cases the result is stored at label 1 and the code moves
! on to the next element via .begin1.  Both early-outs use annulled
! branches, so fdivs/fsqrts execute only when their branch is taken.
! NOTE(review): %o0 is set by the caller of this label (outside this view);
! presumably it holds 0x7fffffff so the first test detects +/-0 -- confirm.
.spec1:
        andcc   %g1,%o0,%g0
        bz,a    1f
        fdivs   FONE,%f14,%f14          ! x0 = FONE / x0;

        cmp     %g1,0
        bl,a    1f
        fsqrts  %f14,%f14               ! x0 = sqrtf(x0);

        ! Renormalise: convert the raw bit pattern (read as an integer) to
        ! float, multiply by FTWO, then lower the exponent field of the
        ! integer image by 0x4b000000 (150 << 23).
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f14,%f0
        fdtos   %f0,%f14
        fmuls   %f14,FTWO,%f14
        st      %f14,[%fp+tmp3]         ! move the float bits to an integer register
        ld      [%fp+tmp3],%g1          !   through the tmp3 stack slot
        sethi   %hi(0x4b000000),%o0
        sra     %g1,13,%l5              ! (4_0) si0 = ax0 >> 13;
        fands   %f14,DC0,%f16           ! (4_0) dfx0 = vis_fand(ddx0,DC0);
        ba      .cont_spec
        sub     %g1,%o0,%g1             ! (delay slot) ax0 -= 0x4b000000
1:
        add     %l7,stridex,%l7         ! px += stridex
        sub     counter,1,counter       ! one element consumed
        st      %f14,[%i2]              ! *py = x0;
        ba      .begin1
        add     %i2,stridey,%i2         ! py += stridey

        .align  16
! .update0: out-of-line handler that cuts the current pass short at lane 1.
! If counter <= 1 nothing is changed.  Otherwise the address %i1 - stridex is
! saved in tmp_px, counter - 1 is saved in tmp_counter, and counter is forced
! to 1.  Either way control returns to .cont0.
! NOTE(review): presumably entered when the lane's input failed the main
! loop's range check, and the saved px/counter pair is picked up later to
! resume from that element -- the branch site and the consumer are outside
! this view; confirm.
.update0:
        cmp     counter,1
        ble     .cont0                  ! lane index not below counter: ignore
        nop

        sub     %i1,stridex,%o1         ! address saved for the restart
        stx     %o1,[%fp+tmp_px]

        sub     counter,1,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont0
        mov     1,counter               ! (delay slot) finish only 1 element this pass

        .align  16
! .update1: out-of-line handler for the lane whose integer image is %g5 and
! whose float value is %f15.
!   counter <= 1            : return to .cont1 unchanged.
!   (%g5 & 0x7fffffff) == 0 : label 1 -- defer this element.
!   %g5 < 0                 : label 1 -- defer this element.
!   otherwise               : renormalise the value inline and recompute the
!                             lane's table index (%l6), exponent word
!                             (tmp0+4), xx1 (%f46) and K3*xx1 (%f50).
! Label 1 saves %i1 - stridex in tmp_px and counter - 1 in tmp_counter, then
! forces counter to 1.
! NOTE(review): the branch site is outside this view; presumably it is taken
! when ax1 < 0x00800000.  The (n_m) stage tags on the existing comments are
! carried over from the main loop.
.update1:
        sethi   %hi(0x7ffffc00),%o0
        cmp     counter,1
        ble     .cont1

        add     %o0,0x3ff,%o0           ! (delay slot) %o0 = 0x7fffffff

        andcc   %g5,%o0,%g0
        bz,a    1f                      ! +/-0
        nop

        cmp     %g5,0
        bl,a    1f                      ! sign bit set
        nop

        ! Renormalise: bit pattern (as integer) -> float, times FTWO, then
        ! lower the exponent field of the integer image by 150 << 23.
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f15,%f0
        fdtos   %f0,%f15
        fmuls   %f15,FTWO,%f15
        st      %f15,[%fp+tmp3]
        ld      [%fp+tmp3],%g5
        sethi   %hi(0x4b000000),%o0
        sub     %g5,%o0,%g5

        fands   %f15,DC0,%f17           ! (4_0) dfx0 = vis_fand(ddx0,DC0);

        sra     %g5,13,%l6              ! (5_0) si1 = ax1 >> 13;

        sra     %g5,24,%l7              ! (5_0) iexp1 = ax1 >> 24;
        and     %l6,2032,%l6            ! (5_0) si1 &= 0x7f0;

        fpsub32s        %f15,%f17,%f17  ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%l6+TBL],%f46          ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sub     %l0,%l7,%l1             ! (5_0) iexp1 = 0x3f - iexp1;

        sll     %l1,23,%l1              ! (5_0) lexp1 = iexp1 << 23;
        add     %l6,TBL,%l6             ! (5_0) addr1 = (char*)TBL + si1;
        st      %l1,[%fp+tmp0+4]        ! (4_0) fdx0 = *((double*)lexp0);  (second 32-bit word only)
        fitod   %f17,%f44               ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);

        fmuld   %f44,%f46,%f46          ! (5_1) xx1 = dtmp1 * tbl_div1;

        ba      .cont1
        fmuld   K3,%f46,%f50            ! (5_1) res1 = K3 * xx1;
1:
        sub     %i1,stridex,%o1         ! address saved for the restart
        stx     %o1,[%fp+tmp_px]

        sub     counter,1,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont1
        mov     1,counter               ! (delay slot) finish only 1 element this pass

        .align  16
! .update2: out-of-line handler that cuts the current pass short at lane 2.
! If counter <= 2 nothing is saved.  Otherwise the address %o5 - 2*stridex is
! saved in tmp_px, counter - 2 in tmp_counter, and counter is forced to 2.
! Either way control returns to .cont2.  %o1 is clobbered on both paths
! because the first subtraction sits in the (non-annulled) delay slot.
! NOTE(review): branch site and the consumer of tmp_px/tmp_counter are
! outside this view; presumably this is the Inf/NaN check of the lane.
.update2:
        cmp     counter,2
        ble     .cont2                  ! lane index not below counter: ignore
        sub     %o5,stridex,%o1         ! (delay slot) %o1 = %o5 - stridex

        sub     %o1,stridex,%o1         ! %o1 = %o5 - 2*stridex
        stx     %o1,[%fp+tmp_px]

        sub     counter,2,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont2
        mov     2,counter               ! (delay slot) finish only 2 elements this pass

        .align  16
! .update3: out-of-line handler for the lane whose integer image is %g1 and
! whose float value is %f18.
!   counter <= 2            : return to .cont3 unchanged.
!   (%g1 & 0x7fffffff) == 0 : label 1 -- defer this element.
!   %g1 < 0                 : label 1 -- defer this element.
!   otherwise               : renormalise inline and recompute si0 (%o0),
!                             tbl_div0 (%f54), iexp0 (%g5) and dtmp0 (%f56).
! The early-outs use annulled branches whose slot pre-computes
! %o1 = %o5 - stridex; label 1 subtracts stridex once more, saves the
! address in tmp_px and counter - 2 in tmp_counter, then forces counter to 2.
! NOTE(review): the branch site is outside this view; presumably it is taken
! when ax0 < 0x00800000.
.update3:
        sethi   %hi(0x7ffffc00),%o1
        cmp     counter,2
        ble     .cont3

        add     %o1,0x3ff,%o1           ! (delay slot) %o1 = 0x7fffffff

        andcc   %g1,%o1,%g0
        bz,a    1f                      ! +/-0
        sub     %o5,stridex,%o1         ! (annulled slot) %o1 = %o5 - stridex

        cmp     %g1,0
        bl,a    1f                      ! sign bit set
        sub     %o5,stridex,%o1         ! (annulled slot) %o1 = %o5 - stridex

        ! Renormalise: bit pattern (as integer) -> float, times FTWO, then
        ! lower the exponent field of the integer image by 150 << 23.
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f18,%f0
        fdtos   %f0,%f18
        fmuls   %f18,FTWO,%f18
        st      %f18,[%fp+tmp3]
        ld      [%fp+tmp3],%g1
        sethi   %hi(0x4b000000),%o1
        sub     %g1,%o1,%g1

        fand    %f18,DC0,%f56           ! (0_0) dfx0 = vis_fand(ddx0,DC0);
        sra     %g1,13,%o0              ! (0_0) si0 = ax0 >> 13;

        and     %o0,2032,%o0            ! (0_0) si0 &= 0x7f0;

        ldd     [%o0+TBL],%f54          ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        fpsub32 %f18,%f56,%f30          ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        sra     %g1,24,%i3              ! (0_0) iexp0 = ax0 >> 24;
        sub     %l0,%i3,%g5             ! (0_0) iexp0 = 0x3f - iexp0;
        ba      .cont3
        fitod   %f30,%f56               ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
1:
        sub     %o1,stridex,%o1         ! %o1 = %o5 - 2*stridex
        stx     %o1,[%fp+tmp_px]

        sub     counter,2,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont3
        mov     2,counter               ! (delay slot) finish only 2 elements this pass

        .align  16
! .update4: out-of-line handler that cuts the current pass short at lane 3.
! If counter <= 3 nothing is saved.  Otherwise the address
! %l7 - stridex2 - stridex is saved in tmp_px, counter - 3 in tmp_counter,
! and counter is forced to 3.  Either way control returns to .cont4.
! %o1 is clobbered on both paths (non-annulled delay slot).
! NOTE(review): branch site and the consumer of tmp_px/tmp_counter are
! outside this view; presumably this is the Inf/NaN check of the lane.
.update4:
        cmp     counter,3
        ble     .cont4                  ! lane index not below counter: ignore
        sub     %l7,stridex2,%o1        ! (delay slot) %o1 = %l7 - stridex2

        sub     %o1,stridex,%o1         ! %o1 = %l7 - stridex2 - stridex
        stx     %o1,[%fp+tmp_px]

        sub     counter,3,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont4
        mov     3,counter               ! (delay slot) finish only 3 elements this pass

        .align  16
! .update5: out-of-line handler for the lane whose integer image is %i4 and
! whose float value is %f19.
!   counter <= 3            : return to .cont5 unchanged.
!   (%i4 & 0x7fffffff) == 0 : label 1 -- defer this element.
!   %i4 < 0                 : label 1 -- defer this element.
!   otherwise               : renormalise inline and recompute si1 (%o7),
!                             the exponent word (tmp1+4), xx1 (%f44) and
!                             K3*xx1 (%f50).  %f0/%f1 serve as scratch.
! The early-outs use annulled branches whose slot pre-computes
! %o1 = %l7 - stridex2; label 1 subtracts stridex once more, saves the
! address in tmp_px and counter - 3 in tmp_counter, then forces counter to 3.
! NOTE(review): the branch site is outside this view; presumably it is taken
! when ax1 < 0x00800000.
.update5:
        sethi   %hi(0x7ffffc00),%o1
        cmp     counter,3
        ble     .cont5

        add     %o1,0x3ff,%o1           ! (delay slot) %o1 = 0x7fffffff

        andcc   %i4,%o1,%g0
        bz,a    1f                      ! +/-0
        sub     %l7,stridex2,%o1        ! (annulled slot) %o1 = %l7 - stridex2

        cmp     %i4,0
        bl,a    1f                      ! sign bit set
        sub     %l7,stridex2,%o1        ! (annulled slot) %o1 = %l7 - stridex2

        ! Renormalise: bit pattern (as integer) -> float, times FTWO, then
        ! lower the exponent field of the integer image by 150 << 23.
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f19,%f0
        fdtos   %f0,%f19
        fmuls   %f19,FTWO,%f19
        st      %f19,[%fp+tmp3]
        ld      [%fp+tmp3],%i4
        sethi   %hi(0x4b000000),%o1
        sub     %i4,%o1,%i4

        fands   %f19,DC0,%f0            ! (0_0) dfx0 = vis_fand(ddx0,DC0);

        sra     %i4,13,%g5              ! (1_0) si1 = ax1 >> 13;

        sra     %i4,24,%i1              ! (1_0) iexp1 = ax1 >> 24;
        and     %g5,2032,%o7            ! (1_0) si1 &= 0x7f0;
        fpsub32s        %f19,%f0,%f31   ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%o7+TBL],%f44          ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sub     %l0,%i1,%i0             ! (1_0) iexp1 = 0x3f - iexp1;

        sll     %i0,23,%i0              ! (1_0) lexp1 = iexp1 << 23;
        fitod   %f31,%f50               ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);

        st      %i0,[%fp+tmp1+4]        ! (0_0) fdx0 = *((double*)lexp0);  (second 32-bit word only)

        add     %o7,TBL,%o7             ! (1_0) addr0 = (char*)TBL + si0;
        fmuld   %f50,%f44,%f44          ! (1_0) xx0 = dtmp0 * tbl_div0;

        ba      .cont5
        fmuld   K3,%f44,%f50            ! (1_0) res1 = K3 * xx1;
1:
        sub     %o1,stridex,%o1         ! %o1 = %l7 - stridex2 - stridex
        stx     %o1,[%fp+tmp_px]

        sub     counter,3,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont5
        mov     3,counter               ! (delay slot) finish only 3 elements this pass

        .align  16
! .update6: out-of-line handler that cuts the current pass short at lane 4.
! If counter <= 4 nothing is saved.  Otherwise the address %l7 - 2*stridex is
! saved in tmp_px, counter - 4 in tmp_counter, and counter is forced to 4.
! Either way control returns to .cont6.  %o3 is clobbered on both paths
! (non-annulled delay slot).
! NOTE(review): branch site and the consumer of tmp_px/tmp_counter are
! outside this view; presumably this is the Inf/NaN check of the lane.
.update6:
        cmp     counter,4
        ble     .cont6                  ! lane index not below counter: ignore
        sub     %l7,stridex,%o3         ! (delay slot) %o3 = %l7 - stridex

        sub     %o3,stridex,%o3         ! %o3 = %l7 - 2*stridex
        stx     %o3,[%fp+tmp_px]

        sub     counter,4,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont6
        mov     4,counter               ! (delay slot) finish only 4 elements this pass

        .align  16
! .update7: out-of-line handler for the lane whose integer image is %g1 and
! whose float value is %f24.
!   counter <= 4            : return to .cont7 unchanged.
!   (%g1 & 0x7fffffff) == 0 : label 1 -- defer this element.
!   %g1 < 0                 : label 1 -- defer this element.
!   otherwise               : renormalise inline and recompute addr0 (%i0),
!                             the exponent word (first word of tmp2) and
!                             xx0 (%f30).  %f0/%f1 serve as scratch.
! The early-outs use annulled branches whose slot pre-computes
! %o3 = %l7 - stridex; label 1 subtracts stridex once more, saves the
! address in tmp_px and counter - 4 in tmp_counter, then forces counter to 4.
! NOTE(review): the branch site is outside this view; presumably it is taken
! when ax0 < 0x00800000.
.update7:
        sethi   %hi(0x7ffffc00),%o3
        cmp     counter,4
        ble     .cont7

        add     %o3,0x3ff,%o3           ! (delay slot) %o3 = 0x7fffffff

        andcc   %g1,%o3,%g0
        bz,a    1f                      ! +/-0
        sub     %l7,stridex,%o3         ! (annulled slot) %o3 = %l7 - stridex

        cmp     %g1,0
        bl,a    1f                      ! sign bit set
        sub     %l7,stridex,%o3         ! (annulled slot) %o3 = %l7 - stridex

        ! Renormalise: bit pattern (as integer) -> float, times FTWO, then
        ! lower the exponent field of the integer image by 150 << 23.
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f24,%f0
        fdtos   %f0,%f24
        fmuls   %f24,FTWO,%f24
        st      %f24,[%fp+tmp3]
        ld      [%fp+tmp3],%g1
        sethi   %hi(0x4b000000),%o3
        sub     %g1,%o3,%g1

        fands   %f24,DC0,%f0            ! (2_0) dfx0 = vis_fand(ddx0,DC0);
        sra     %g1,13,%i0              ! (2_0) si0 = ax0 >> 13;

        and     %i0,2032,%i0            ! (2_0) si0 &= 0x7f0;

        ldd     [%i0+TBL],%f30          ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        fpsub32s        %f24,%f0,%f12   ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        sra     %g1,24,%i3              ! (2_0) iexp0 = ax0 >> 24;

        sub     %l0,%i3,%g5             ! (2_0) iexp0 = 0x3f - iexp0;

        ! 32-bit shift by 23; the word is stored to the first (most
        ! significant, big-endian) half of tmp2, which equals << 55 of the
        ! 64-bit lexp0.
        sll     %g5,23,%g5              ! (2_0) lexp0 = iexp0 << 55;
        add     %i0,TBL,%i0             ! (2_0) addr0 = (char*)TBL + si0;
        fitod   %f12,%f56               ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);

        st      %g5,[%fp+tmp2]          ! (2_0) fdx0 = *((double*)lexp0);  (first 32-bit word only)
        ba      .cont7
        fmuld   %f56,%f30,%f30          ! (2_0) xx0 = dtmp0 * tbl_div0;
1:
        sub     %o3,stridex,%o3         ! %o3 = %l7 - 2*stridex
        stx     %o3,[%fp+tmp_px]

        sub     counter,4,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont7
        mov     4,counter               ! (delay slot) finish only 4 elements this pass

        .align  16
! .update8: out-of-line handler that cuts the current pass short at lane 5.
! If counter <= 5 nothing is changed.  Otherwise the address %l7 - stridex is
! saved in tmp_px, counter - 5 in tmp_counter, and counter is forced to 5.
! Either way control returns to .cont8.
! NOTE(review): branch site and the consumer of tmp_px/tmp_counter are
! outside this view; presumably this is the Inf/NaN check of the lane.
.update8:
        cmp     counter,5
        ble     .cont8                  ! lane index not below counter: ignore
        nop

        sub     %l7,stridex,%o3         ! address saved for the restart
        stx     %o3,[%fp+tmp_px]

        sub     counter,5,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont8
        mov     5,counter               ! (delay slot) finish only 5 elements this pass

        .align  16
! .update9: out-of-line handler for the lane whose integer image is %o5.
! The float value is re-read from memory at %l7 - stridex into %f0.
!   counter <= 5            : return to .cont9 (only %i3 is clobbered).
!   (%o5 & 0x7fffffff) == 0 : label 1 -- defer this element.
!   %o5 < 0                 : label 1 -- defer this element.
!   otherwise               : renormalise inline and recompute addr1 (%o1),
!                             the exponent word (tmp2+4), xx1 (%f24) and
!                             K3*xx1 (%f50).  %f0 and %f8/%f9 are scratch.
! Label 1 saves %i3 (= %l7 - stridex) in tmp_px and counter - 5 in
! tmp_counter, then forces counter to 5.
! NOTE(review): the branch site is outside this view; presumably it is taken
! when ax1 < 0x00800000.
.update9:
        sethi   %hi(0x7ffffc00),%o3
        cmp     counter,5
        ble     .cont9
        sub     %l7,stridex,%i3         ! (delay slot) %i3 = address of this element

        add     %o3,0x3ff,%o3           ! %o3 = 0x7fffffff

        andcc   %o5,%o3,%g0
        bz      1f                      ! +/-0
        ld      [%i3],%f0               ! (delay slot, always runs) reload x

        cmp     %o5,0
        bl,a    1f                      ! sign bit set
        nop

        ! Renormalise: bit pattern (as integer) -> float, times FTWO, then
        ! lower the exponent field of the integer image by 150 << 23.
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f0,%f0
        fdtos   %f0,%f0
        fmuls   %f0,FTWO,%f0
        st      %f0,[%fp+tmp3]
        ld      [%fp+tmp3],%o5
        sethi   %hi(0x4b000000),%o3
        sub     %o5,%o3,%o5

        fands   %f0,DC0,%f8             ! (2_0) dfx0 = vis_fand(ddx0,DC0);

        sra     %o5,13,%o1              ! (3_0) si1 = ax1 >> 13;

        sra     %o5,24,%o3              ! (3_0) iexp1 = ax1 >> 24;
        and     %o1,2032,%o1            ! (3_0) si1 &= 0x7f0;
        fpsub32s        %f0,%f8,%f0     ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%o1+TBL],%f8           ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sub     %l0,%o3,%i3             ! (3_0) iexp1 = 0x3f - iexp1;

        sllx    %i3,23,%i3              ! (3_0) lexp1 = iexp1 << 23;
        fitod   %f0,%f50                ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);

        add     %o1,TBL,%o1             ! (3_0) addr1 = (char*)TBL + si1;
        st      %i3,[%fp+tmp2+4]        ! (2_0) fdx0 = *((double*)lexp0);  (second 32-bit word only)

        fmuld   %f50,%f8,%f24           ! (3_0) xx1 = dtmp1 * tbl_div1;

        ba      .cont9
        fmuld   K3,%f24,%f50            ! (3_0) res1 = K3 * xx1;
1:
        stx     %i3,[%fp+tmp_px]        ! address saved for the restart

        sub     counter,5,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont9
        mov     5,counter               ! (delay slot) finish only 5 elements this pass

        .align  16
! .update10: out-of-line handler that cuts the current pass short at lane 0.
! If counter <= 0 nothing is saved.  Otherwise the address %i1 - 2*stridex is
! saved in tmp_px, the whole of counter is saved in tmp_counter, and counter
! is forced to 0.  Either way control returns to .cont10.  %o3 is clobbered
! on both paths (non-annulled delay slot).
! NOTE(review): branch site and the consumer of tmp_px/tmp_counter are
! outside this view; presumably this is the Inf/NaN check of the lane.
.update10:
        cmp     counter,0
        ble     .cont10                 ! nothing remaining: ignore
        sub     %i1,stridex,%o3         ! (delay slot) %o3 = %i1 - stridex

        sub     %o3,stridex,%o3         ! %o3 = %i1 - 2*stridex
        stx     %o3,[%fp+tmp_px]

        st      counter,[%fp+tmp_counter]       ! every remaining element is deferred

        ba      .cont10
        mov     0,counter               ! (delay slot) no further elements this pass

        .align  16
! .update11: out-of-line handler for the lane whose float value is %f14.
! Its integer image is re-read from memory at %i1 - 2*stridex into %i3.
!   counter <= 0            : return to .cont11 (%i4, %o3 clobbered).
!   (%i3 & 0x7fffffff) == 0 : label 1 -- defer this element.
!   %i3 < 0                 : label 1 -- defer this element.
!   otherwise               : renormalise inline and recompute si0 (%l5),
!                             tbl_div0 (%f54), the exponent word (first word
!                             of tmp0) and xx0 (%f40).
! Label 1 saves %o3 in tmp_px and counter in tmp_counter, then forces
! counter to 0.
! NOTE(review): the branch site is outside this view; presumably it is taken
! when ax0 < 0x00800000.
.update11:
        sethi   %hi(0x7ffffc00),%i4
        cmp     counter,0
        ble     .cont11
        sub     %i1,stridex,%o3         ! (delay slot) %o3 = %i1 - stridex

        sub     %o3,stridex,%o3         ! %o3 = %i1 - 2*stridex (address of this element)
        add     %i4,0x3ff,%i4           ! %i4 = 0x7fffffff
        ld      [%o3],%i3               ! reload the integer image of x

        andcc   %i3,%i4,%g0
        bz      1f                      ! +/-0; the cmp below is this branch's delay slot

        cmp     %i3,0
        bl,a    1f                      ! sign bit set
        nop

        ! Renormalise: bit pattern (as integer) -> float, times FTWO, then
        ! lower the exponent field of the integer image by 150 << 23.
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f14,%f0
        fdtos   %f0,%f14
        fmuls   %f14,FTWO,%f14
        st      %f14,[%fp+tmp3]
        ld      [%fp+tmp3],%i3
        sethi   %hi(0x4b000000),%o3
        sub     %i3,%o3,%i3

        fands   %f14,DC0,%f16           ! (4_0) dfx0 = vis_fand(ddx0,DC0);
        sra     %i3,13,%l5              ! (4_0) si0 = ax0 >> 13;

        and     %l5,2032,%l5            ! (4_0) si0 &= 0x7f0;

        ldd     [%l5+TBL],%f54          ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        fpsub32s        %f14,%f16,%f16  ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        sra     %i3,24,%i3              ! (4_0) iexp0 = ax0 >> 24;

        sub     %l0,%i3,%o0             ! (4_0) iexp0 = 0x3f - iexp0;
        fitod   %f16,%f56               ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);

        ! Shift by 23; only the low 32 bits are stored, to the first (most
        ! significant, big-endian) word of tmp0, which equals << 55 of the
        ! 64-bit lexp0.
        sllx    %o0,23,%o0              ! (4_0) lexp0 = iexp0 << 55;

        st      %o0,[%fp+tmp0]          ! (4_0) fdx0 = *((double*)lexp0);  (first 32-bit word only)

        ba      .cont11
        fmuld   %f56,%f54,%f40          ! (4_0) xx0 = dtmp0 * tbl_div0;
1:
        stx     %o3,[%fp+tmp_px]        ! address saved for the restart

        st      counter,[%fp+tmp_counter]       ! every remaining element is deferred

        ba      .cont11
        mov     0,counter               ! (delay slot) no further elements this pass

        .align  16
! .update12: out-of-line handler that cuts the current pass short at lane 1.
! If counter <= 1 nothing is changed.  Otherwise %i1 is moved back by
! stridex and saved in tmp_px, counter - 1 is saved in tmp_counter, and
! counter is forced to 1.  Either way control returns to .cont12.  On the
! saving path %i1 itself is modified.
! NOTE(review): branch site and the consumer of tmp_px/tmp_counter are
! outside this view; presumably this is the Inf/NaN check of the lane.
.update12:
        cmp     counter,1
        ble     .cont12                 ! lane index not below counter: ignore
        nop

        sub     %i1,stridex,%i1         ! %i1 -= stridex (address saved for the restart)
        stx     %i1,[%fp+tmp_px]

        sub     counter,1,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont12
        mov     1,counter               ! (delay slot) finish only 1 element this pass

        .align  16
! .update13: out-of-line handler for the lane whose integer image is %g5 and
! whose float value is %f15.
!   counter <= 1            : return to .cont13 unchanged (apart from %o3).
!   (%g5 & 0x7fffffff) == 0 : label 1 -- defer this element.
!   %g5 < 0                 : label 1 -- defer this element.
!   otherwise               : renormalise inline and recompute addr1 (%l6),
!                             the exponent word (tmp0+4), xx1 (%f46) and
!                             K3*xx1 (%f50).  %f0/%f1 serve as scratch.
! Label 1 moves %i1 back by stridex, saves it in tmp_px, saves counter - 1
! in tmp_counter, then forces counter to 1.
! NOTE(review): the branch site is outside this view; presumably it is taken
! when ax1 < 0x00800000.
.update13:
        sethi   %hi(0x7ffffc00),%o3
        cmp     counter,1
        ble     .cont13

        add     %o3,0x3ff,%o3           ! (delay slot) %o3 = 0x7fffffff

        andcc   %g5,%o3,%g0
        bz      1f                      ! +/-0; the cmp below is this branch's delay slot

        cmp     %g5,0
        bl,a    1f                      ! sign bit set
        nop

        ! Renormalise: bit pattern (as integer) -> float, times FTWO, then
        ! lower the exponent field of the integer image by 150 << 23.
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f15,%f0
        fdtos   %f0,%f15
        fmuls   %f15,FTWO,%f15
        st      %f15,[%fp+tmp3]
        ld      [%fp+tmp3],%g5
        sethi   %hi(0x4b000000),%o3
        sub     %g5,%o3,%g5

        fands   %f15,DC0,%f17           ! (4_0) dfx0 = vis_fand(ddx0,DC0);

        sra     %g5,13,%l6              ! (5_0) si1 = ax1 >> 13;
        sra     %g5,24,%o3              ! (5_0) iexp1 = ax1 >> 24;
        and     %l6,2032,%l6            ! (5_0) si1 &= 0x7f0;
        fpsub32s        %f15,%f17,%f17  ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%l6+TBL],%f46          ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sub     %l0,%o3,%l1             ! (5_0) iexp1 = 0x3f - iexp1;

        add     %l6,TBL,%l6             ! (5_0) addr1 = (char*)TBL + si1;

        sllx    %l1,23,%l1              ! (5_0) lexp1 = iexp1 << 23;
        st      %l1,[%fp+tmp0+4]        ! (4_0) fdx0 = *((double*)lexp0);  (second 32-bit word only)

        fitod   %f17,%f0                ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);

        fmuld   %f0,%f46,%f46           ! (5_1) xx1 = dtmp1 * tbl_div1;
        ba      .cont13
        fmuld   K3,%f46,%f50            ! (5_1) res1 = K3 * xx1;
1:
        sub     %i1,stridex,%i1         ! %i1 -= stridex (address saved for the restart)
        stx     %i1,[%fp+tmp_px]

        sub     counter,1,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont13
        mov     1,counter               ! (delay slot) finish only 1 element this pass

        .align  16
! .update14: taken from the main loop when lane (0_0) has
! ax0 >= 0x7f800000 (signed compare).  If counter <= 2 nothing is saved.
! Otherwise the address %o5 - 2*stridex is saved in tmp_px, counter - 2 in
! tmp_counter, and counter is forced to 2 so that the current pass stops
! before this element.  Either way control returns to .cont14.  %o3 is
! clobbered on both paths (non-annulled delay slot).
! NOTE(review): the consumer of tmp_px/tmp_counter is outside this view;
! presumably the restart code resumes from the saved element.
.update14:
        cmp     counter,2
        ble     .cont14                 ! lane index not below counter: ignore
        sub     %o5,stridex,%o3         ! (delay slot) %o3 = %o5 - stridex

        sub     %o3,stridex,%o3         ! %o3 = %o5 - 2*stridex
        stx     %o3,[%fp+tmp_px]

        sub     counter,2,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont14
        mov     2,counter               ! (delay slot) finish only 2 elements this pass

        .align  16
! .update15: taken from the main loop when lane (0_0) has ax0 < 0x00800000
! (signed compare), i.e. %g1 is zero, a denormal pattern, or negative.
! The float value is in %f18.
!   counter <= 2            : return to .cont15 (%i3, %o3 clobbered).
!   (%g1 & 0x7fffffff) == 0 : label 1 -- defer this element.
!   %g1 < 0                 : label 1 -- defer this element.
!   otherwise               : renormalise inline and recompute si0 (%o0),
!                             tbl_div0 (%f54), iexp0 (%g5) and dtmp0 (%f56),
!                             the values the main loop derived from the raw
!                             bits.  %f0/%f1 serve as scratch.
! Label 1 saves %o3 (= %o5 - 2*stridex on the zero path) in tmp_px and
! counter - 2 in tmp_counter, then forces counter to 2.
.update15:
        sethi   %hi(0x7ffffc00),%i3
        cmp     counter,2
        ble     .cont15
        sub     %o5,stridex,%o3         ! (delay slot) %o3 = %o5 - stridex

        add     %i3,0x3ff,%i3           ! %i3 = 0x7fffffff

        andcc   %g1,%i3,%g0
        bz      1f                      ! +/-0
        sub     %o3,stridex,%o3         ! (delay slot, always runs) %o3 = %o5 - 2*stridex

        cmp     %g1,0
        bl,a    1f                      ! sign bit set
        nop

        ! Renormalise: bit pattern (as integer) -> float, times FTWO, then
        ! lower the exponent field of the integer image by 150 << 23.
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f18,%f0
        fdtos   %f0,%f18
        fmuls   %f18,FTWO,%f18
        st      %f18,[%fp+tmp3]
        ld      [%fp+tmp3],%g1
        sethi   %hi(0x4b000000),%o3
        sub     %g1,%o3,%g1

        fands   %f18,DC0,%f0            ! (0_0) dfx0 = vis_fand(ddx0,DC0);
        sra     %g1,13,%o0              ! (0_0) si0 = ax0 >> 13;
        and     %o0,2032,%o0            ! (0_0) si0 &= 0x7f0;

        ldd     [%o0+TBL],%f54          ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        fpsub32s        %f18,%f0,%f30   ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        sra     %g1,24,%i3              ! (0_0) iexp0 = ax0 >> 24;

        sub     %l0,%i3,%g5             ! (0_0) iexp0 = 0x3f - iexp0;

        ba      .cont15
        fitod   %f30,%f56               ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
1:
        stx     %o3,[%fp+tmp_px]        ! address saved for the restart

        sub     counter,2,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont15
        mov     2,counter               ! (delay slot) finish only 2 elements this pass

        .align  16
! .update16: taken from the main loop when lane (1_0) has
! ax1 >= 0x7f800000 (signed compare).  If counter <= 3 nothing is saved.
! Otherwise the address %l7 - stridex2 - stridex is saved in tmp_px,
! counter - 3 in tmp_counter, and counter is forced to 3 so that the current
! pass stops before this element.  Either way control returns to .cont16.
! %o3 is clobbered on both paths (non-annulled delay slot).
! NOTE(review): the consumer of tmp_px/tmp_counter is outside this view;
! presumably the restart code resumes from the saved element.
.update16:
        cmp     counter,3
        ble     .cont16                 ! lane index not below counter: ignore
        sub     %l7,stridex2,%o3        ! (delay slot) %o3 = %l7 - stridex2

        sub     %o3,stridex,%o3         ! %o3 = %l7 - stridex2 - stridex
        stx     %o3,[%fp+tmp_px]

        sub     counter,3,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont16
        mov     3,counter               ! (delay slot) finish only 3 elements this pass

        .align  16
! .update17: taken from the main loop when lane (1_0) has ax1 < 0x00800000
! (signed compare), i.e. %i4 is zero, a denormal pattern, or negative.
! The float value is in %f19.
!   counter <= 3            : return to .cont17 (%i3, %o3 clobbered).
!   (%i4 & 0x7fffffff) == 0 : label 1 -- defer this element.
!   %i4 < 0                 : label 1 -- defer this element.
!   otherwise               : renormalise inline and recompute addr (%o7),
!                             the exponent word (tmp1+4), xx1 (%f44) and
!                             K3*xx1 (%f50).  %f0/%f1 serve as scratch.
! Label 1 saves %o3 (= %l7 - stridex2 - stridex on the zero path) in tmp_px
! and counter - 3 in tmp_counter, then forces counter to 3.
.update17:
        sethi   %hi(0x7ffffc00),%i3
        cmp     counter,3
        ble     .cont17
        sub     %l7,stridex2,%o3        ! (delay slot) %o3 = %l7 - stridex2

        add     %i3,0x3ff,%i3           ! %i3 = 0x7fffffff

        andcc   %i4,%i3,%g0
        bz      1f                      ! +/-0
        sub     %o3,stridex,%o3         ! (delay slot, always runs) %o3 -= stridex

        cmp     %i4,0
        bl,a    1f                      ! sign bit set
        nop

        ! Renormalise: bit pattern (as integer) -> float, times FTWO, then
        ! lower the exponent field of the integer image by 150 << 23.
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f19,%f0
        fdtos   %f0,%f19
        fmuls   %f19,FTWO,%f19
        st      %f19,[%fp+tmp3]
        ld      [%fp+tmp3],%i4
        sethi   %hi(0x4b000000),%o3
        sub     %i4,%o3,%i4

        fands   %f19,DC0,%f0            ! (0_0) dfx0 = vis_fand(ddx0,DC0);

        sra     %i4,13,%g5              ! (1_0) si1 = ax1 >> 13;

        sra     %i4,24,%i0              ! (1_0) iexp1 = ax1 >> 24;
        and     %g5,2032,%o7            ! (1_0) si1 &= 0x7f0;
        fpsub32s        %f19,%f0,%f31   ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%o7+TBL],%f44          ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sub     %l0,%i0,%i0             ! (1_0) iexp1 = 0x3f - iexp1;

        sllx    %i0,23,%i0              ! (1_0) lexp1 = iexp1 << 23;
        fitod   %f31,%f50               ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);

        st      %i0,[%fp+tmp1+4]        ! (0_0) fdx0 = *((double*)lexp0);  (second 32-bit word only)

        add     %o7,TBL,%o7             ! (1_0) addr0 = (char*)TBL + si0;
        fmuld   %f50,%f44,%f44          ! (1_0) xx0 = dtmp0 * tbl_div0;

        ba      .cont17
        fmuld   K3,%f44,%f50            ! (1_0) res1 = K3 * xx1;
1:
        stx     %o3,[%fp+tmp_px]        ! address saved for the restart

        sub     counter,3,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont17
        mov     3,counter               ! (delay slot) finish only 3 elements this pass

        .align  16
! .update18: taken from the main loop when lane (2_0) has
! ax0 >= 0x7f800000 (signed compare).  The main loop's fpadd32 that sits
! between the branch's delay slot and .cont18 is skipped by the branch, so
! it is replayed here in a non-annulled delay slot and runs on both paths.
! If counter <= 4 nothing is saved.  Otherwise the address %l7 - stridex2 is
! saved in tmp_px, counter - 4 in tmp_counter, and counter is forced to 4.
! Either way control returns to .cont18.
! NOTE(review): the consumer of tmp_px/tmp_counter is outside this view;
! presumably the restart code resumes from the saved element.
.update18:
        cmp     counter,4
        ble     .cont18                 ! lane index not below counter: ignore
        fpadd32 %f20,%f52,%f0           ! (2_1) dres0 = vis_fpadd32(dres0,fdx0);

        sub     %l7,stridex2,%i3        ! address saved for the restart
        stx     %i3,[%fp+tmp_px]

        sub     counter,4,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont18
        mov     4,counter               ! (delay slot) finish only 4 elements this pass

        .align  16
! .update19: taken from the main loop when lane (2_0) has ax0 < 0x00800000
! (signed compare), i.e. %g1 is zero, a denormal pattern, or negative.
! The float value is in %f24.
! The main loop's (3_0) fmuld that sits between the branch's delay slot and
! .cont19 is skipped by the branch, so every path back to .cont19 replays it
! in its final delay slot.
!   counter <= 4            : return to .cont19 (after the fmuld).
!   (%g1 & 0x7fffffff) == 0 : label 1 -- defer this element.
!   %g1 < 0                 : label 1 -- defer this element.
!   otherwise               : renormalise inline and recompute addr0 (%i0),
!                             the exponent word (first word of tmp2) and
!                             xx0 (%f30).  %f8 serves as scratch.
! Label 1 saves %l7 - stridex2 in tmp_px and counter - 4 in tmp_counter,
! then forces counter to 4.
.update19:
        sethi   %hi(0x7ffffc00),%i3
        cmp     counter,4
        ble,a   .cont19
        fmuld   %f50,%f46,%f24          ! (3_0) xx1 = dtmp1 * tbl_div1;

        add     %i3,0x3ff,%i3           ! %i3 = 0x7fffffff

        andcc   %g1,%i3,%g0
        bz      1f                      ! +/-0
        nop

        cmp     %g1,0
        bl,a    1f                      ! sign bit set
        nop

        ! Renormalise: bit pattern (as integer) -> float, times FTWO, then
        ! lower the exponent field of the integer image by 150 << 23.
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f24,%f24
        fdtos   %f24,%f24
        fmuls   %f24,FTWO,%f24
        st      %f24,[%fp+tmp3]
        ld      [%fp+tmp3],%g1
        sethi   %hi(0x4b000000),%i3
        sub     %g1,%i3,%g1

        fands   %f24,DC0,%f8            ! (2_0) dfx0 = vis_fand(ddx0,DC0);
        sra     %g1,13,%i0              ! (2_0) si0 = ax0 >> 13;

        and     %i0,2032,%i0            ! (2_0) si0 &= 0x7f0;

        ldd     [%i0+TBL],%f30          ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
        fpsub32s        %f24,%f8,%f12   ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        sra     %g1,24,%i3              ! (2_0) iexp0 = ax0 >> 24;

        sub     %l0,%i3,%g5             ! (2_0) iexp0 = 0x3f - iexp0;

        ! Shift by 23; only the low 32 bits are stored, to the first (most
        ! significant, big-endian) word of tmp2, which equals << 55 of the
        ! 64-bit lexp0.
        sllx    %g5,23,%g5              ! (2_0) lexp0 = iexp0 << 55;
        add     %i0,TBL,%i0             ! (2_0) addr0 = (char*)TBL + si0;
        fitod   %f12,%f56               ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);

        st      %g5,[%fp+tmp2]          ! (2_0) fdx0 = *((double*)lexp0);  (first 32-bit word only)
        fmuld   %f56,%f30,%f30          ! (2_0) xx0 = dtmp0 * tbl_div0;

        ba      .cont19
        fmuld   %f50,%f46,%f24          ! (3_0) xx1 = dtmp1 * tbl_div1;
1:
        sub     %l7,stridex2,%i3        ! address saved for the restart
        stx     %i3,[%fp+tmp_px]

        sub     counter,4,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        mov     4,counter               ! finish only 4 elements this pass
        ba      .cont19
        fmuld   %f50,%f46,%f24          ! (3_0) xx1 = dtmp1 * tbl_div1;

        .align  16
! .update20: taken from the main loop when lane (3_0) has
! ax1 >= 0x7f800000 (signed compare).  If counter <= 5 nothing is changed.
! Otherwise the address %l7 - stridex is saved in tmp_px, counter - 5 in
! tmp_counter, and counter is forced to 5 so that the current pass stops
! before this element.  Either way control returns to .cont20.
! NOTE(review): the consumer of tmp_px/tmp_counter is outside this view;
! presumably the restart code resumes from the saved element.
.update20:
        cmp     counter,5
        ble     .cont20                 ! lane index not below counter: ignore
        nop

        sub     %l7,stridex,%i3         ! address saved for the restart
        stx     %i3,[%fp+tmp_px]

        sub     counter,5,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont20
        mov     5,counter               ! (delay slot) finish only 5 elements this pass

        .align  16
! .update21: taken from the main loop when lane (3_0) has ax1 < 0x00800000
! (signed compare), i.e. %o5 is zero, a denormal pattern, or negative.
! The float value is re-read from memory at %l7 - stridex into %f8.
!   counter <= 5            : return to .cont21 (only %i3 is clobbered).
!   (%o5 & 0x7fffffff) == 0 : label 1 -- defer this element.
!   %o5 < 0                 : label 1 -- defer this element.
!   otherwise               : renormalise inline and recompute addr1 (%o1),
!                             the exponent word (tmp2+4), xx1 (%f24) and
!                             K3*xx1 (%f50).  %f8/%f9 serve as scratch.
! Label 1 saves %l7 - stridex in tmp_px and counter - 5 in tmp_counter, then
! forces counter to 5.  %i4 is overwritten with the element address.
.update21:
        sethi   %hi(0x7ffffc00),%i3
        cmp     counter,5
        ble,a   .cont21
        nop

        sub     %l7,stridex,%i4         ! %i4 = address of this element
        add     %i3,0x3ff,%i3           ! %i3 = 0x7fffffff

        andcc   %o5,%i3,%g0
        bz      1f                      ! +/-0
        ld      [%i4],%f8               ! (delay slot, always runs) reload x

        cmp     %o5,0
        bl,a    1f                      ! sign bit set
        nop

        ! Renormalise: bit pattern (as integer) -> float, times FTWO, then
        ! lower the exponent field of the integer image by 150 << 23.
        ! NOTE(review): FTWO is defined outside this view; presumably 2.0f.
        fitod   %f8,%f8
        fdtos   %f8,%f8
        fmuls   %f8,FTWO,%f8
        st      %f8,[%fp+tmp3]
        ld      [%fp+tmp3],%o5
        sethi   %hi(0x4b000000),%i3
        sub     %o5,%i3,%o5

        fands   %f8,DC0,%f24            ! (2_0) dfx0 = vis_fand(ddx0,DC0);

        sra     %o5,13,%o1              ! (3_0) si1 = ax1 >> 13;

        sra     %o5,24,%i3              ! (3_0) iexp1 = ax1 >> 24;
        and     %o1,2032,%o1            ! (3_0) si1 &= 0x7f0;
        fpsub32s        %f8,%f24,%f24   ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);

        ldd     [%o1+TBL],%f8           ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
        sub     %l0,%i3,%i3             ! (3_0) iexp1 = 0x3f - iexp1;

        sllx    %i3,23,%i3              ! (3_0) lexp1 = iexp1 << 23;
        fitod   %f24,%f50               ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);

        add     %o1,TBL,%o1             ! (3_0) addr1 = (char*)TBL + si1;
        st      %i3,[%fp+tmp2+4]        ! (2_0) fdx0 = *((double*)lexp0);  (second 32-bit word only)

        fmuld   %f50,%f8,%f24           ! (3_0) xx1 = dtmp1 * tbl_div1;

        ba      .cont21
        fmuld   K3,%f24,%f50            ! (3_0) res1 = K3 * xx1;
1:
        sub     %l7,stridex,%i3         ! address saved for the restart
        stx     %i3,[%fp+tmp_px]

        sub     counter,5,counter       ! elements deferred to the restart
        st      counter,[%fp+tmp_counter]

        ba      .cont21
        mov     5,counter               ! (delay slot) finish only 5 elements this pass

        .align  16
! .exit: common function epilogue -- return to the caller; the register
! window is restored in the delay slot of the return.
.exit:
        ret
        restore                         ! (delay slot) pop the register window

        SET_SIZE(__vrsqrtf)