root/usr/src/lib/libmvec/common/vis/__vrsqrt.S
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

        .file   "__vrsqrt.S"

#include "libm.h"

        RO_DATA
        .align  64

/*
 * Read-only constant pool for __vrsqrt.
 *
 * Every entry is one 8-byte double written as a (high word, low word) pair.
 * The code addresses entries by byte offset from the table base, so the
 * order and size of the entries below must not change:
 *
 *   0x00 K1   0x08 K2   0x10 K3   0x18 K4   0x20 K5   0x28 K6
 *   0x30 DC0  0x38 DC1  0x40 DC2  0x48 DC3  0x50 DC4
 *   0x58 D2ON51         0x60 DONE
 */
.CONST_TBL:
        /*
         * Polynomial coefficients.  Per the algorithm notes in this file
         * they are combined in Horner form:
         *   res = (((((K6 * xx + K5) * xx + K4) * xx + K3) * xx + K2) * xx + K1) * xx
         */
        .word   0xbfe00000, 0x0000002f  ! K1 =-5.00000000000005209867e-01;
        .word   0x3fd80000, 0x00000058  ! K2 = 3.75000000000004884257e-01;
        .word   0xbfd3ffff, 0xff444bc8  ! K3 =-3.12499999317136886551e-01;
        .word   0x3fd17fff, 0xff5006fe  ! K4 = 2.73437499359815081532e-01;
        .word   0xbfcf80bb, 0xb33ef574  ! K5 =-2.46116125605037803130e-01;
        .word   0x3fcce0af, 0xf8156949  ! K6 = 2.25606914648617522896e-01;

        /*
         * Bit patterns used with the VIS logical/partitioned ops, as listed
         * in the algorithm notes:
         *   DC0  AND mask   - res   = vis_fand(res, DC0)      (keeps the low 53 bits)
         *   DC1  OR pattern - res   = vis_for(res, DC1)       (sets exponent bits 0x3fe)
         *   DC2  addend     - res_c = vis_fpadd32(res, DC2)   (adds 0x2000 to the high word)
         *   DC3  AND mask   - res_c = vis_fand(res_c, DC3)    (keeps high-word bits 14..30)
         *   DC4  AND mask   - res   = vis_fand(res, DC4)      (keeps the low 51 bits;
         *                     used only on the hx < 0x00100000 path)
         */
        .word   0x001fffff, 0xffffffff  ! DC0
        .word   0x3fe00000, 0x00000000  ! DC1
        .word   0x00002000, 0x00000000  ! DC2
        .word   0x7fffc000, 0x00000000  ! DC3
        .word   0x0007ffff, 0xffffffff  ! DC4

        /*
         * D2ON51 is added after the integer-to-double conversion on the
         * 0x00080000 <= hx < 0x00100000 path of the algorithm notes.
         * DONE is the numerator of the "res = DONE / res" special cases.
         */
        .word   0x43200000, 0x00000000  ! D2ON51  = pow(2,51)
        .word   0x3ff00000, 0x00000000  ! DONE   = 1.0

/*
 * Integer register aliases.
 *
 * Comments are kept on their own lines on purpose: text that follows the
 * value on a #define line becomes part of the macro expansion.
 *
 *   stridex      byte stride through the input; set from %i2 << 3
 *   stridey      byte stride through the output; set from %i4 << 3
 *   counter      elements left to process; the loop exits when it is <= 0
 *   TBL          address of __vlibm_TBL_rsqrt, obtained with PIC_SET
 *   _0x7ff00000  constant compared against the high word hx
 *   _0x00100000  constant compared against the high word hx
 */
#define stridex         %l5
#define stridey         %l7
#define counter         %l0
#define TBL             %l3
#define _0x7ff00000     %o0
#define _0x00100000     %o1

/*
 * Floating-point registers holding constants loaded from .CONST_TBL.
 * DC4 and D2ON51 have no alias in this group.
 *
 * NOTE(review): DONE shares %f4 with a value that the main loop also
 * writes (for example "ldd [%i2],%f4"), and DONE is reloaded from the
 * table at .begin1 - keep that in mind before relying on DONE being live.
 */
#define DC0             %f56
#define DC1             %f54
#define DC2             %f48
#define DC3             %f46
#define K6              %f42
#define K5              %f20
#define K4              %f52
#define K3              %f50
#define K2              %f14
#define K1              %f12
#define DONE            %f4

/*
 * Restart state.  tmp_counter is initialised from %i0 and tmp_px from %i1;
 * at .begin they are copied into counter and %i1, and tmp_counter is then
 * cleared.
 */
#define tmp_counter     %g5
#define tmp_px          %o5

/*
 * Eight 8-byte scratch slots addressed as [%fp+tmpN].  The visible code
 * stores the shifted exponent word with stx and later reloads the same
 * slot with ldd as the double dlexp.
 */
#define tmp0            STACK_BIAS-0x40
#define tmp1            STACK_BIAS-0x38
#define tmp2            STACK_BIAS-0x30
#define tmp3            STACK_BIAS-0x28
#define tmp4            STACK_BIAS-0x20
#define tmp5            STACK_BIAS-0x18
#define tmp6            STACK_BIAS-0x10
#define tmp7            STACK_BIAS-0x08

/*
 * tmps is the extra frame space requested by the save instruction
 * (-SA(MINFRAME)-tmps); 0x40 bytes covers the eight slots above.
 */
! sizeof temp storage - must be a multiple of 16 for V9
#define tmps            0x40

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!      !!!!!   algorithm   !!!!!
!  ((float*)&res)[0] = ((float*)px)[0];
!  ((float*)&res)[1] = ((float*)px)[1];
!  hx = *(int*)px;
!  if ( hx >= 0x7ff00000 )
!  {
!    res = DONE / res;
!    ((float*)py)[0] = ((float*)&res)[0];
!    ((float*)py)[1] = ((float*)&res)[1];
!    px += stridex;
!    py += stridey;
!    continue;
!  }
!  if ( hx < 0x00100000 )
!  {
!    ax = hx & 0x7fffffff;
!    lx = ((int*)px)[1];
!
!    if ( (ax | lx) == 0 )
!    {
!      res = DONE / res;
!      ((float*)py)[0] = ((float*)&res)[0];
!      ((float*)py)[1] = ((float*)&res)[1];
!      px += stridex;
!      py += stridey;
!      continue;
!    }
!    else if ( hx >= 0 )
!    {
!      if ( hx < 0x00080000 )
!      {
!        res = *(long long*)&res;
!        hx = *(int*)&res - (537 << 21);
!      }
!      else
!      {
!        res = vis_fand(res,DC4);
!        res = *(long long*)&res;
!        res += D2ON51;
!        hx = *(int*)&res - (537 << 21);
!      }
!    }
!    else
!    {
!      res = sqrt(res);
!      ((float*)py)[0] = ((float*)&res)[0];
!      ((float*)py)[1] = ((float*)&res)[1];
!      px += stridex;
!      py += stridey;
!      continue;
!    }
!  }
!
!  iexp = hx >> 21;
!  iexp = -iexp;
!  iexp += 0x5fe;
!  lexp = iexp << 52;
!  dlexp = *(double*)&lexp;
!  hx >>= 10;
!  hx &= 0x7f8;
!  hx += 8;
!  hx &= -16;
!
!  res = vis_fand(res,DC0);
!  res = vis_for(res,DC1);
!  res_c = vis_fpadd32(res,DC2);
!  res_c = vis_fand(res_c,DC3);
!
!  addr = (char*)arr + hx;
!  dexp_hi = ((double*)addr)[0];
!  dexp_lo = ((double*)addr)[1];
!  dtmp0 = dexp_hi * dexp_hi;
!  xx = res - res_c;
!  xx *= dtmp0;
!  res = K6 * xx;
!  res += K5;
!  res *= xx;
!  res += K4;
!  res *= xx;
!  res += K3;
!  res *= xx;
!  res += K2;
!  res *= xx;
!  res += K1;
!  res *= xx;
!  res = dexp_hi * res;
!  res += dexp_lo;
!  res += dexp_hi;
!
!  res *= dlexp;
!
!  ((float*)py)[0] = ((float*)&res)[0];
!  ((float*)py)[1] = ((float*)&res)[1];
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

        ENTRY(__vrsqrt)
        save    %sp,-SA(MINFRAME)-tmps,%sp
        PIC_SETUP(l7)
        PIC_SET(l7,.CONST_TBL,o3)
        PIC_SET(l7,__vlibm_TBL_rsqrt,l3)
        wr      %g0,0x82,%asi

        ldd     [%o3],K1
        sethi   %hi(0x7ff00000),%o0
        mov     %i3,%o4

        ldd     [%o3+0x08],K2
        sethi   %hi(0x00100000),%o1
        mov     %i1,tmp_px

        ldd     [%o3+0x10],K3
        sll     %i2,3,stridex
        mov     %i0,tmp_counter

        ldd     [%o3+0x18],K4
        sll     %i4,3,stridey

        ldd     [%o3+0x20],K5
        ldd     [%o3+0x28],K6
        ldd     [%o3+0x30],DC0
        ldd     [%o3+0x38],DC1
        ldd     [%o3+0x40],DC2
        ldd     [%o3+0x48],DC3

.begin:
        mov     tmp_counter,counter
        mov     tmp_px,%i1
        clr     tmp_counter
.begin1:
        cmp     counter,0
        ble,pn  %icc,.exit
        ldd     [%o3+0x60],DONE

        lda     [%i1]%asi,%f0           ! (6_0) ((float*)res)[0] = ((float*)px)[0];
        sethi   %hi(0x7ffffc00),%i0

        lda     [%i1+4]%asi,%f1         ! (6_0) ((float*)res)[1] = ((float*)px)[1];
        add     %i0,1023,%i0

        fand    %f0,DC0,%f16            ! (6_0) res = vis_fand(res,DC0);

        lda     [%i1]%asi,%g1           ! (6_1) hx = *(int*)px;
        sethi   %hi(0x00080000),%i4

        lda     [%i1+4]%asi,%l4
        add     %i1,stridex,%l6         ! px += stridex

        sra     %g1,21,%o7              ! (6_1) iexp = hx >> 21;
        lda     [%l6]%asi,%f8           ! (0_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f44           ! (6_1) res = vis_for(res,DC1);

        lda     [%l6+4]%asi,%f9         ! (0_0) ((float*)res)[1] = ((float*)px)[1];
        sra     %g1,10,%o2              ! (6_1) hx >>= 10;
        and     %g1,%i0,%i2

        cmp     %g1,_0x7ff00000         ! (6_1) hx ? 0x7ff00000
        bge,pn  %icc,.spec0             ! (6_1) if ( hx >= 0x7ff00000 )
        and     %o2,2040,%o2            ! (6_1) hx &= 0x7f8;

        cmp     %g1,_0x00100000         ! (6_1) hx ? 0x00100000
        bl,pn   %icc,.spec1             ! (6_1) if ( hx < 0x00100000 )
        sub     %g0,%o7,%o7             ! (6_1) iexp = -iexp;
.cont_spec:
        fand    %f8,DC0,%f16            ! (0_0) res = vis_fand(res,DC0);

        fpadd32 %f44,DC2,%f18           ! (6_1) res_c = vis_fpadd32(res,DC2);

        add     %o2,8,%l4               ! (6_1) hx += 8;

        add     %o7,1534,%o7            ! (6_1) iexp += 0x5fe;

        lda     [%l6]%asi,%g1           ! (0_0) hx = *(int*)px;
        sllx    %o7,52,%o7              ! (6_1) iexp << 52;
        and     %l4,-16,%l4             ! (6_1) hx = -16;

        add     %l4,TBL,%l4             ! (6_1) addr = (char*)arr + hx;
        stx     %o7,[%fp+tmp1]          ! (6_1) dlexp = *(double*)lexp;

        add     %l6,stridex,%l6         ! px += stridex
        ldd     [%l4],%f30              ! (6_1) dtmp0 = ((double*)addr)[0];

        sra     %g1,21,%o7              ! (0_0) iexp = hx >> 21;
        lda     [%l6]%asi,%f0           ! (1_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f28           ! (0_0) res = vis_for(res,DC1);

        sra     %g1,10,%o2              ! (0_0) hx >>= 10;
        sub     %g0,%o7,%o7             ! (0_0) iexp = -iexp;
        lda     [%l6+4]%asi,%f1         ! (1_0) ((float*)res)[1] = ((float*)px)[1];

        cmp     %g1,_0x7ff00000         ! (0_0) hx ? 0x7ff00000
        bge,pn  %icc,.update0           ! (0_0) if ( hx >= 0x7ff00000 )
        fand    %f18,DC3,%f6            ! (6_1) res_c = vis_fand(res_c,DC3);
.cont0:
        and     %o2,2040,%o2            ! (0_0) hx &= 0x7f8;
        fmuld   %f30,%f30,%f10          ! (6_1) dtmp0 = dexp_hi * dexp_hi;

        cmp     %g1,_0x00100000         ! (0_0) hx ? 0x00100000
        bl,pn   %icc,.update1           ! (0_0) if ( hx < 0x00100000 )
        add     %o7,1534,%o7            ! (0_0) iexp += 0x5fe;
.cont1:
        fand    %f0,DC0,%f16            ! (1_0) res = vis_fand(res,DC0);

        fpadd32 %f28,DC2,%f18           ! (0_0) res_c = vis_fpadd32(res,DC2);

        add     %o2,8,%l2               ! (0_0) hx += 8;
        fsubd   %f44,%f6,%f6            ! (6_1) xx = res - res_c;

        lda     [%l6]%asi,%g1           ! (1_0) hx = *(int*)px;
        sllx    %o7,52,%o7              ! (0_0) iexp << 52;
        and     %l2,-16,%l2             ! (0_0) hx = -16;

        add     %l2,TBL,%l2             ! (0_0) addr = (char*)arr + hx;
        add     %l6,stridex,%l6         ! px += stridex
        stx     %o7,[%fp+tmp2]          ! (0_0) dlexp = *(double*)lexp;

        fmuld   %f6,%f10,%f26           ! (6_1) xx *= dtmp0;
        ldd     [%l2],%f10              ! (0_0) dtmp0 = ((double*)addr)[0];

        sra     %g1,21,%o7              ! (1_0) iexp = hx >> 21;
        lda     [%l6]%asi,%f6           ! (2_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f44           ! (1_0) res = vis_for(res,DC1);

        sra     %g1,10,%o2              ! (1_0) hx >>= 10;
        cmp     %g1,_0x7ff00000         ! (1_0) hx ? 0x7ff00000
        bge,pn  %icc,.update2           ! (1_0) if ( hx >= 0x7ff00000 )
        lda     [%l6+4]%asi,%f7         ! (2_0) ((float*)res)[1] = ((float*)px)[1];
.cont2:
        fand    %f18,DC3,%f8            ! (0_0) res_c = vis_fand(res_c,DC3);

        fmuld   %f10,%f10,%f10          ! (0_0) dtmp0 = dexp_hi * dexp_hi;
        cmp     %g1,_0x00100000         ! (1_0) hx ? 0x00100000
        bl,pn   %icc,.update3           ! (1_0) if ( hx < 0x00100000 )
        and     %o2,2040,%o2            ! (1_0) hx &= 0x7f8;
.cont3:
        sub     %g0,%o7,%o7             ! (1_0) iexp = -iexp;
        fand    %f6,DC0,%f16            ! (2_0) res = vis_fand(res,DC0);

        add     %o7,1534,%o7            ! (1_0) iexp += 0x5fe;
        fpadd32 %f44,DC2,%f18           ! (1_0) res_c = vis_fpadd32(res,DC2);

        fmuld   K6,%f26,%f62            ! (6_1) res = K6 * xx;
        add     %o2,8,%i2               ! (1_0) hx += 8;
        fsubd   %f28,%f8,%f32           ! (0_0) xx = res - res_c;

        lda     [%l6]%asi,%g1           ! (2_0) hx = *(int*)px;
        sllx    %o7,52,%o7              ! (1_0) iexp << 52;
        and     %i2,-16,%i2             ! (1_0) hx = -16;

        add     %i2,TBL,%i2             ! (1_0) addr = (char*)arr + hx;
        stx     %o7,[%fp+tmp3]          ! (1_0) dlexp = *(double*)lexp;

        fmuld   %f32,%f10,%f32          ! (0_0) xx *= dtmp0;
        add     %l6,stridex,%l6         ! px += stridex
        ldd     [%i2],%f10              ! (1_0) dtmp0 = ((double*)addr)[0];
        faddd   %f62,K5,%f62            ! (6_1) res += K5;

        sra     %g1,21,%o7              ! (2_0) iexp = hx >> 21;
        lda     [%l6]%asi,%f0           ! (3_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f28           ! (2_0) res = vis_for(res,DC1);

        sra     %g1,10,%o2              ! (2_0) hx >>= 10;
        cmp     %g1,_0x7ff00000         ! (2_0) hx ? 0x7ff00000
        bge,pn  %icc,.update4           ! (2_0) if ( hx >= 0x7ff00000 )
        lda     [%l6+4]%asi,%f1         ! (3_0) ((float*)res)[1] = ((float*)px)[1];
.cont4:
        fmuld   %f62,%f26,%f40          ! (6_1) res *= xx;
        fand    %f18,DC3,%f8            ! (1_0) res_c = vis_fand(res_c,DC3);

        fmuld   %f10,%f10,%f10          ! (1_0) dtmp0 = dexp_hi * dexp_hi;
        cmp     %g1,_0x00100000         ! (2_0) hx ? 0x00100000
        bl,pn   %icc,.update5           ! (2_0) if ( hx < 0x00100000 )
        and     %o2,2040,%o2            ! (2_0) hx &= 0x7f8;
.cont5:
        sub     %g0,%o7,%o7             ! (2_0) iexp = -iexp;
        fand    %f0,DC0,%f16            ! (3_0) res = vis_fand(res,DC0);

        add     %o7,1534,%o7            ! (2_0) iexp += 0x5fe;
        fpadd32 %f28,DC2,%f18           ! (2_0) res_c = vis_fpadd32(res,DC2);

        fmuld   K6,%f32,%f62            ! (0_0) res = K6 * xx;
        add     %o2,8,%i4               ! (2_0) hx += 8;
        fsubd   %f44,%f8,%f6            ! (1_0) xx = res - res_c;

        faddd   %f40,K4,%f40            ! (6_1) res += K4;

        lda     [%l6]%asi,%g1           ! (3_0) hx = *(int*)px;
        sllx    %o7,52,%o7              ! (2_0) iexp << 52;
        and     %i4,-16,%i4             ! (2_0) hx = -16;

        add     %i4,TBL,%i4             ! (2_0) addr = (char*)arr + hx;
        stx     %o7,[%fp+tmp4]          ! (2_0) dlexp = *(double*)lexp;

        fmuld   %f6,%f10,%f38           ! (1_0) xx *= dtmp0;
        ldd     [%i4],%f24              ! (2_0) dtmp0 = ((double*)addr)[0];
        faddd   %f62,K5,%f62            ! (0_0) res += K5;

        fmuld   %f40,%f26,%f34          ! (6_1) res *= xx;
        add     %l6,stridex,%l6         ! px += stridex

        sra     %g1,21,%o7              ! (3_0) iexp = hx >> 21;
        lda     [%l6]%asi,%f8           ! (4_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f44           ! (3_0) res = vis_for(res,DC1);

        sra     %g1,10,%o2              ! (3_0) hx >>= 10;
        cmp     %g1,_0x7ff00000         ! (3_0) hx ? 0x7ff00000
        bge,pn  %icc,.update6           ! (3_0) if ( hx >= 0x7ff00000 )
        lda     [%l6+4]%asi,%f9         ! (4_0) ((float*)res)[1] = ((float*)px)[1];
.cont6:
        fmuld   %f62,%f32,%f60          ! (0_0) res *= xx;
        cmp     %g1,_0x00100000         ! (3_0) hx ? 0x00100000
        fand    %f18,DC3,%f22           ! (2_0) res_c = vis_fand(res_c,DC3);

        fmuld   %f24,%f24,%f24          ! (2_0) dtmp0 = dexp_hi * dexp_hi;
        bl,pn   %icc,.update7           ! (3_0) if ( hx < 0x00100000 )
        and     %o2,2040,%o2            ! (3_0) hx &= 0x7f8;
        faddd   %f34,K3,%f6             ! (6_1) res += K3;
.cont7:
        sub     %g0,%o7,%o7             ! (3_0) iexp = -iexp;
        fand    %f8,DC0,%f16            ! (4_0) res = vis_fand(res,DC0);

        add     %o7,1534,%o7            ! (3_0) iexp += 0x5fe;
        fpadd32 %f44,DC2,%f18           ! (3_0) res_c = vis_fpadd32(res,DC2);

        fmuld   K6,%f38,%f62            ! (1_0) res = K6 * xx;
        add     %o2,8,%i5               ! (3_0) hx += 8;
        fsubd   %f28,%f22,%f28          ! (2_0) xx = res - res_c;

        fmuld   %f6,%f26,%f22           ! (6_1) res *= xx;
        faddd   %f60,K4,%f60            ! (0_0) res += K4;

        lda     [%l6]%asi,%g1           ! (4_0) hx = *(int*)px;
        sllx    %o7,52,%o7              ! (3_0) iexp << 52;
        and     %i5,-16,%i5             ! (3_0) hx = -16;

        add     %i5,TBL,%i5             ! (3_0) addr = (char*)arr + hx;
        stx     %o7,[%fp+tmp5]          ! (3_0) dlexp = *(double*)lexp;

        fmuld   %f28,%f24,%f36          ! (2_0) xx *= dtmp0;
        add     %l6,stridex,%i0         ! px += stridex
        ldd     [%i5],%f28              ! (3_0) dtmp0 = ((double*)addr)[0];
        faddd   %f62,K5,%f62            ! (1_0) res += K5;

        faddd   %f22,K2,%f10            ! (6_1) res += K2;
        fmuld   %f60,%f32,%f34          ! (0_0) res *= xx;

        sra     %g1,21,%o7              ! (4_0) iexp = hx >> 21;
        lda     [%i0]%asi,%f0           ! (5_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f24           ! (4_0) res = vis_for(res,DC1);

        sra     %g1,10,%o2              ! (4_0) hx >>= 10;
        cmp     %g1,_0x7ff00000         ! (4_0) hx ? 0x7ff00000
        bge,pn  %icc,.update8           ! (4_0) if ( hx >= 0x7ff00000 )
        lda     [%i0+4]%asi,%f1         ! (5_0) ((float*)res)[1] = ((float*)px)[1];
.cont8:
        fand    %f18,DC3,%f40           ! (3_0) res_c = vis_fand(res_c,DC3);
        fmuld   %f62,%f38,%f62          ! (1_0) res *= xx;

        fmuld   %f10,%f26,%f58          ! (6_1) res *= xx;
        cmp     %g1,_0x00100000         ! (4_0) hx ? 0x00100000
        and     %o2,2040,%o2            ! (4_0) hx &= 0x7f8;
        faddd   %f34,K3,%f60            ! (0_0) res += K3;

        fmuld   %f28,%f28,%f28          ! (3_0) dtmp0 = dexp_hi * dexp_hi;
        bl,pn   %icc,.update9           ! (4_0) if ( hx < 0x00100000 )
        sub     %g0,%o7,%o7             ! (4_0) iexp = -iexp;
        fand    %f0,DC0,%f16            ! (5_0) res = vis_fand(res,DC0);
.cont9:
        add     %o7,1534,%o7            ! (4_0) iexp += 0x5fe;
        fpadd32 %f24,DC2,%f18           ! (4_0) res_c = vis_fpadd32(res,DC2);

        fmuld   K6,%f36,%f10            ! (2_0) res = K6 * xx;
        add     %o2,8,%l1               ! (4_0) hx += 8;
        fsubd   %f44,%f40,%f44          ! (3_0) xx = res - res_c;

        fmuld   %f60,%f32,%f60          ! (0_0) res *= xx;
        faddd   %f62,K4,%f6             ! (1_0) res += K4;

        lda     [%i0]%asi,%g1           ! (5_0) hx = *(int*)px;
        sllx    %o7,52,%o7              ! (4_0) iexp << 52;
        and     %l1,-16,%l1             ! (4_0) hx = -16;
        faddd   %f58,K1,%f58            ! (6_1) res += K1;

        add     %i0,stridex,%i1         ! px += stridex
        add     %l1,TBL,%l1             ! (4_0) addr = (char*)arr + hx;
        stx     %o7,[%fp+tmp6]          ! (4_0) dlexp = *(double*)lexp;

        fmuld   %f44,%f28,%f40          ! (3_0) xx *= dtmp0;
        ldd     [%l1],%f44              ! (4_0) dtmp0 = ((double*)addr)[0];
        faddd   %f10,K5,%f62            ! (2_0) res += K5;

        fmuld   %f6,%f38,%f34           ! (1_0) res *= xx;
        sra     %g1,21,%o7              ! (5_0) iexp = hx >> 21;
        nop
        faddd   %f60,K2,%f60            ! (0_0) res += K2;

        for     %f16,DC1,%f28           ! (5_0) res = vis_for(res,DC1);
        sub     %g0,%o7,%o7             ! (5_0) iexp = -iexp;
        lda     [%i1]%asi,%f6           ! (6_0) ((float*)res)[0] = ((float*)px)[0];
        fmuld   %f58,%f26,%f26          ! (6_1) res *= xx;

        sra     %g1,10,%o2              ! (5_0) hx >>= 10;
        cmp     %g1,_0x7ff00000         ! (5_0) hx ? 0x7ff00000
        bge,pn  %icc,.update10          ! (5_0) if ( hx >= 0x7ff00000 )
        lda     [%i1+4]%asi,%f7         ! (6_0) ((float*)res)[1] = ((float*)px)[1];
.cont10:
        fand    %f18,DC3,%f8            ! (4_0) res_c = vis_fand(res_c,DC3);
        fmuld   %f62,%f36,%f62          ! (2_0) res *= xx;

        fmuld   %f60,%f32,%f58          ! (0_0) res *= xx;
        cmp     %g1,_0x00100000         ! (5_0) hx ? 0x00100000
        and     %o2,2040,%o2            ! (5_0) hx &= 0x7f8;
        faddd   %f34,K3,%f34            ! (1_0) res += K3;

        fmuld   %f30,%f26,%f26          ! (6_1) res = dexp_hi * res;
        bl,pn   %icc,.update11          ! (5_0) if ( hx < 0x00100000 )
        nop
        fand    %f6,DC0,%f16            ! (6_0) res = vis_fand(res,DC0);
.cont11:
        ldd     [%l4+8],%f60            ! (6_1) dexp_lo = ((double*)addr)[1];
        fmuld   %f44,%f44,%f44          ! (4_0) dtmp0 = dexp_hi * dexp_hi;
        fpadd32 %f28,DC2,%f18           ! (5_0) res_c = vis_fpadd32(res,DC2);

        fmuld   K6,%f40,%f22            ! (3_0) res = K6 * xx;
        add     %o2,8,%i3               ! (5_0) hx += 8;
        fsubd   %f24,%f8,%f10           ! (4_0) xx = res - res_c;

        fmuld   %f34,%f38,%f24          ! (1_0) res *= xx;
        or      %g0,%o4,%i0

        cmp     counter,7
        bl,pn   %icc,.tail
        faddd   %f62,K4,%f34            ! (2_0) res += K4;

        ba      .main_loop
        sub     counter,7,counter       ! counter

        .align  16
.main_loop:
        add     %o7,1534,%o7            ! (5_0) iexp += 0x5fe;
        and     %i3,-16,%i3             ! (5_1) hx = -16;
        lda     [%i1]%asi,%g1           ! (6_1) hx = *(int*)px;
        faddd   %f58,K1,%f58            ! (0_1) res += K1;

        add     %i3,TBL,%i3             ! (5_1) addr = (char*)arr + hx;
        sllx    %o7,52,%o7              ! (5_1) iexp << 52;
        stx     %o7,[%fp+tmp0]          ! (5_1) dlexp = *(double*)lexp;
        faddd   %f26,%f60,%f8           ! (6_2) res += dexp_lo;

        faddd   %f22,K5,%f62            ! (3_1) res += K5;
        add     %i1,stridex,%l6         ! px += stridex
        ldd     [%i3],%f22              ! (5_1) dtmp0 = ((double*)addr)[0];
        fmuld   %f10,%f44,%f60          ! (4_1) xx *= dtmp0;

        faddd   %f24,K2,%f26            ! (1_1) res += K2;
        add     %i0,stridey,%i1         ! px += stridey
        ldd     [%l2],%f24              ! (0_1) dexp_hi = ((double*)addr)[0];
        fmuld   %f34,%f36,%f34          ! (2_1) res *= xx;

        fmuld   %f58,%f32,%f58          ! (0_1) res *= xx;
        sra     %g1,21,%o7              ! (6_1) iexp = hx >> 21;
        lda     [%l6]%asi,%f0           ! (0_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f44           ! (6_1) res = vis_for(res,DC1);

        lda     [%l6+4]%asi,%f1         ! (0_0) ((float*)res)[1] = ((float*)px)[1];
        sra     %g1,10,%o2              ! (6_1) hx >>= 10;
        fmuld   %f22,%f22,%f10          ! (5_1) dtmp0 = dexp_hi * dexp_hi;
        faddd   %f8,%f30,%f30           ! (6_2) res += dexp_hi;

        fmuld   %f62,%f40,%f32          ! (3_1) res *= xx;
        cmp     %g1,_0x7ff00000         ! (6_1) hx ? 0x7ff00000
        ldd     [%fp+tmp1],%f62         ! (6_2) dlexp = *(double*)lexp;
        fand    %f18,DC3,%f8            ! (5_1) res_c = vis_fand(res_c,DC3);

        fmuld   %f26,%f38,%f26          ! (1_1) res *= xx;
        bge,pn  %icc,.update12          ! (6_1) if ( hx >= 0x7ff00000 )
        and     %o2,2040,%o2            ! (6_1) hx &= 0x7f8;
        faddd   %f34,K3,%f34            ! (2_1) res += K3;
.cont12:
        fmuld   %f24,%f58,%f58          ! (0_1) res = dexp_hi * res;
        cmp     %g1,_0x00100000         ! (6_1) hx ? 0x00100000
        sub     %g0,%o7,%o7             ! (6_1) iexp = -iexp;
        fand    %f0,DC0,%f16            ! (0_0) res = vis_fand(res,DC0);

        fmuld   %f30,%f62,%f2           ! (6_2) res *= dlexp;
        bl,pn   %icc,.update13          ! (6_1) if ( hx < 0x00100000 )
        ldd     [%l2+8],%f30            ! (0_1) dexp_lo = ((double*)addr)[1];
        fpadd32 %f44,DC2,%f18           ! (6_1) res_c = vis_fpadd32(res,DC2);
.cont13:
        fmuld   K6,%f60,%f62            ! (4_1) res = K6 * xx;
        add     %o2,8,%l4               ! (6_1) hx += 8;
        st      %f2,[%i0]               ! (6_2) ((float*)py)[0] = ((float*)res)[0];
        fsubd   %f28,%f8,%f6            ! (5_1) xx = res - res_c;

        fmuld   %f34,%f36,%f28          ! (2_1) res *= xx;
        add     %o7,1534,%o7            ! (6_1) iexp += 0x5fe;
        st      %f3,[%i0+4]             ! (6_2) ((float*)py)[1] = ((float*)res)[1];
        faddd   %f32,K4,%f32            ! (3_1) res += K4;

        lda     [%l6]%asi,%g1           ! (0_0) hx = *(int*)px;
        sllx    %o7,52,%o7              ! (6_1) iexp << 52;
        and     %l4,-16,%l4             ! (6_1) hx = -16;
        faddd   %f26,K1,%f26            ! (1_1) res += K1;

        add     %i1,stridey,%i0         ! px += stridey
        add     %l4,TBL,%l4             ! (6_1) addr = (char*)arr + hx;
        stx     %o7,[%fp+tmp1]          ! (6_1) dlexp = *(double*)lexp;
        faddd   %f58,%f30,%f8           ! (0_1) res += dexp_lo;

        fmuld   %f6,%f10,%f58           ! (5_1) xx *= dtmp0;
        add     %l6,stridex,%l6         ! px += stridex
        ldd     [%l4],%f30              ! (6_1) dtmp0 = ((double*)addr)[0];
        faddd   %f62,K5,%f62            ! (4_1) res += K5;

        fmuld   %f32,%f40,%f34          ! (3_1) res *= xx;
        sra     %g1,10,%o2              ! (0_0) hx >>= 10;
        ldd     [%i2],%f4               ! (1_1) dexp_hi = ((double*)addr)[0];
        faddd   %f28,K2,%f32            ! (2_1) res += K2;

        fmuld   %f26,%f38,%f26          ! (1_1) res *= xx;
        sra     %g1,21,%o7              ! (0_0) iexp = hx >> 21;
        lda     [%l6]%asi,%f6           ! (1_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f28           ! (0_0) res = vis_for(res,DC1);

        fmuld   %f30,%f30,%f30          ! (6_1) dtmp0 = dexp_hi * dexp_hi;
        sub     %g0,%o7,%o7             ! (0_0) iexp = -iexp;
        lda     [%l6+4]%asi,%f7         ! (1_0) ((float*)res)[1] = ((float*)px)[1];
        faddd   %f8,%f24,%f24           ! (0_1) res += dexp_hi;

        fmuld   %f62,%f60,%f38          ! (4_1) res *= xx;
        cmp     %g1,_0x7ff00000         ! (0_0) hx ? 0x7ff00000
        ldd     [%fp+tmp2],%f62         ! (0_1) dlexp = *(double*)lexp;
        fand    %f18,DC3,%f8            ! (6_1) res_c = vis_fand(res_c,DC3);

        fmuld   %f32,%f36,%f32          ! (2_1) res *= xx;
        bge,pn  %icc,.update14          ! (0_0) if ( hx >= 0x7ff00000 )
        and     %o2,2040,%o2            ! (0_0) hx &= 0x7f8;
        faddd   %f34,K3,%f34            ! (3_1) res += K3;
.cont14:
        fmuld   %f4,%f26,%f26           ! (1_1) res = dexp_hi * res;
        cmp     %g1,_0x00100000         ! (0_0) hx ? 0x00100000
        add     %o7,1534,%o7            ! (0_0) iexp += 0x5fe;
        fand    %f6,DC0,%f16            ! (1_0) res = vis_fand(res,DC0);

        fmuld   %f24,%f62,%f2           ! (0_1) res *= dlexp;
        bl,pn   %icc,.update15          ! (0_0) if ( hx < 0x00100000 )
        ldd     [%i2+8],%f24            ! (1_1) dexp_lo = ((double*)addr)[1];
        fpadd32 %f28,DC2,%f18           ! (0_0) res_c = vis_fpadd32(res,DC2);
.cont15:
        fmuld   K6,%f58,%f62            ! (5_1) res = K6 * xx;
        add     %o2,8,%l2               ! (0_0) hx += 8;
        st      %f2,[%i1]               ! (0_1) ((float*)py)[0] = ((float*)res)[0];
        fsubd   %f44,%f8,%f10           ! (6_1) xx = res - res_c;

        fmuld   %f34,%f40,%f44          ! (3_1) res *= xx;
        nop
        st      %f3,[%i1+4]             ! (0_1) ((float*)py)[1] = ((float*)res)[1];
        faddd   %f38,K4,%f38            ! (4_1) res += K4;

        lda     [%l6]%asi,%g1           ! (1_0) hx = *(int*)px;
        sllx    %o7,52,%o7              ! (0_0) iexp << 52;
        and     %l2,-16,%l2             ! (0_0) hx = -16;
        faddd   %f32,K1,%f32            ! (2_1) res += K1;

        add     %l2,TBL,%l2             ! (0_0) addr = (char*)arr + hx;
        add     %l6,stridex,%l6         ! px += stridex
        stx     %o7,[%fp+tmp2]          ! (0_0) dlexp = *(double*)lexp;
        faddd   %f26,%f24,%f8           ! (1_1) res += dexp_lo;

        fmuld   %f10,%f30,%f26          ! (6_1) xx *= dtmp0;
        add     %i0,stridey,%i1         ! px += stridey
        ldd     [%l2],%f30              ! (0_0) dtmp0 = ((double*)addr)[0];
        faddd   %f62,K5,%f62            ! (5_1) res += K5;

        fmuld   %f38,%f60,%f34          ! (4_1) res *= xx;
        sra     %g1,10,%o2              ! (1_0) hx >>= 10;
        ldd     [%i4],%f24              ! (2_1) dexp_hi = ((double*)addr)[0];
        faddd   %f44,K2,%f38            ! (3_1) res += K2;

        fmuld   %f32,%f36,%f32          ! (2_1) res *= xx;
        sra     %g1,21,%o7              ! (1_0) iexp = hx >> 21;
        lda     [%l6]%asi,%f0           ! (2_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f44           ! (1_0) res = vis_for(res,DC1);

        fmuld   %f30,%f30,%f30          ! (0_0) dtmp0 = dexp_hi * dexp_hi;
        cmp     %g1,_0x7ff00000         ! (1_0) hx ? 0x7ff00000
        lda     [%l6+4]%asi,%f1         ! (2_0) ((float*)res)[1] = ((float*)px)[1];
        faddd   %f8,%f4,%f4             ! (1_1) res += dexp_hi;

        fmuld   %f62,%f58,%f36          ! (5_1) res *= xx;
        bge,pn  %icc,.update16          ! (1_0) if ( hx >= 0x7ff00000 )
        ldd     [%fp+tmp3],%f62         ! (1_1) dlexp = *(double*)lexp;
        fand    %f18,DC3,%f8            ! (0_0) res_c = vis_fand(res_c,DC3);
.cont16:
        fmuld   %f38,%f40,%f38          ! (3_1) res *= xx;
        cmp     %g1,_0x00100000         ! (1_0) hx ? 0x00100000
        and     %o2,2040,%o2            ! (1_0) hx &= 0x7f8;
        faddd   %f34,K3,%f34            ! (4_1) res += K3;

        fmuld   %f24,%f32,%f32          ! (2_1) res = dexp_hi * res;
        bl,pn   %icc,.update17          ! (1_0) if ( hx < 0x00100000 )
        sub     %g0,%o7,%o7             ! (1_0) iexp = -iexp;
        fand    %f0,DC0,%f16            ! (2_0) res = vis_fand(res,DC0);
.cont17:
        fmuld   %f4,%f62,%f2            ! (1_1) res *= dlexp;
        add     %o7,1534,%o7            ! (1_0) iexp += 0x5fe;
        ldd     [%i4+8],%f4             ! (2_1) dexp_lo = ((double*)addr)[1];
        fpadd32 %f44,DC2,%f18           ! (1_0) res_c = vis_fpadd32(res,DC2);

        fmuld   K6,%f26,%f62            ! (6_1) res = K6 * xx;
        add     %o2,8,%i2               ! (1_0) hx += 8;
        st      %f2,[%i0]               ! (1_1) ((float*)py)[0] = ((float*)res)[0];
        fsubd   %f28,%f8,%f6            ! (0_0) xx = res - res_c;

        fmuld   %f34,%f60,%f28          ! (4_1) res *= xx;
        nop
        st      %f3,[%i0+4]             ! (1_1) ((float*)py)[1] = ((float*)res)[1];
        faddd   %f36,K4,%f36            ! (5_1) res += K4;

        lda     [%l6]%asi,%g1           ! (2_0) hx = *(int*)px;
        sllx    %o7,52,%o7              ! (1_0) iexp << 52;
        and     %i2,-16,%i2             ! (1_0) hx = -16;
        faddd   %f38,K1,%f38            ! (3_1) res += K1;

        add     %i1,stridey,%i0         ! px += stridey
        add     %i2,TBL,%i2             ! (1_0) addr = (char*)arr + hx;
        stx     %o7,[%fp+tmp3]          ! (1_0) dlexp = *(double*)lexp;
        faddd   %f32,%f4,%f8            ! (2_1) res += dexp_lo;

        fmuld   %f6,%f30,%f32           ! (0_0) xx *= dtmp0;
        add     %l6,stridex,%l6         ! px += stridex
        ldd     [%i2],%f30              ! (1_0) dtmp0 = ((double*)addr)[0];
        faddd   %f62,K5,%f62            ! (6_1) res += K5;

        fmuld   %f36,%f58,%f34          ! (5_1) res *= xx;
        sra     %g1,10,%o2              ! (2_0) hx >>= 10;
        ldd     [%i5],%f4               ! (3_1) dexp_hi = ((double*)addr)[0];
        faddd   %f28,K2,%f36            ! (4_1) res += K2;

        fmuld   %f38,%f40,%f38          ! (3_1) res *= xx;
        sra     %g1,21,%o7              ! (2_0) iexp = hx >> 21;
        lda     [%l6]%asi,%f6           ! (3_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f28           ! (2_0) res = vis_for(res,DC1);

        fmuld   %f30,%f30,%f30          ! (1_0) dtmp0 = dexp_hi * dexp_hi;
        cmp     %g1,_0x7ff00000         ! (2_0) hx ? 0x7ff00000
        lda     [%l6+4]%asi,%f7         ! (3_0) ((float*)res)[1] = ((float*)px)[1];
        faddd   %f8,%f24,%f24           ! (2_1) res += dexp_hi;

        fmuld   %f62,%f26,%f40          ! (6_1) res *= xx;
        bge,pn  %icc,.update18          ! (2_0) if ( hx >= 0x7ff00000 )
        ldd     [%fp+tmp4],%f62         ! (2_1) dlexp = *(double*)lexp;
        fand    %f18,DC3,%f8            ! (1_0) res_c = vis_fand(res_c,DC3);
.cont18:
        fmuld   %f36,%f60,%f36          ! (4_1) res *= xx;
        cmp     %g1,_0x00100000         ! (2_0) hx ? 0x00100000
        and     %o2,2040,%o2            ! (2_0) hx &= 0x7f8;
        faddd   %f34,K3,%f34            ! (5_1) res += K3;

        fmuld   %f4,%f38,%f38           ! (3_1) res = dexp_hi * res;
        bl,pn   %icc,.update19          ! (2_0) if ( hx < 0x00100000 )
        sub     %g0,%o7,%o7             ! (2_0) iexp = -iexp;
        fand    %f6,DC0,%f16            ! (3_0) res = vis_fand(res,DC0);
.cont19:
        fmuld   %f24,%f62,%f2           ! (2_1) res *= dlexp;
        add     %o7,1534,%o7            ! (2_0) iexp += 0x5fe;
        ldd     [%i5+8],%f24            ! (3_1) dexp_lo = ((double*)addr)[1];
        fpadd32 %f28,DC2,%f18           ! (2_0) res_c = vis_fpadd32(res,DC2);

        fmuld   K6,%f32,%f62            ! (0_0) res = K6 * xx;
        add     %o2,8,%i4               ! (2_0) hx += 8;
        st      %f2,[%i1]               ! (2_1) ((float*)py)[0] = ((float*)res)[0];
        fsubd   %f44,%f8,%f10           ! (1_0) xx = res - res_c;

        fmuld   %f34,%f58,%f44          ! (5_1) res *= xx;
        nop
        st      %f3,[%i1+4]             ! (2_1) ((float*)py)[1] = ((float*)res)[1];
        faddd   %f40,K4,%f40            ! (6_1) res += K4;

        lda     [%l6]%asi,%g1           ! (3_0) hx = *(int*)px;
        sllx    %o7,52,%o7              ! (2_0) iexp << 52;
        and     %i4,-16,%i4             ! (2_0) hx = -16;
        faddd   %f36,K1,%f36            ! (4_1) res += K1;

        add     %l6,stridex,%l6         ! px += stridex
        add     %i4,TBL,%i4             ! (2_0) addr = (char*)arr + hx;
        stx     %o7,[%fp+tmp4]          ! (2_0) dlexp = *(double*)lexp;
        faddd   %f38,%f24,%f8           ! (3_1) res += dexp_lo;

        fmuld   %f10,%f30,%f38          ! (1_0) xx *= dtmp0;
        add     %i0,stridey,%i1         ! px += stridey
        ldd     [%i4],%f24              ! (2_0) dtmp0 = ((double*)addr)[0];
        faddd   %f62,K5,%f62            ! (0_0) res += K5;

        fmuld   %f40,%f26,%f34          ! (6_1) res *= xx;
        sra     %g1,10,%o2              ! (3_0) hx >>= 10;
        ldd     [%l1],%f30              ! (4_1) dexp_hi = ((double*)addr)[0];
        faddd   %f44,K2,%f40            ! (5_1) res += K2;

        fmuld   %f36,%f60,%f36          ! (4_1) res *= xx;
        sra     %g1,21,%o7              ! (3_0) iexp = hx >> 21;
        lda     [%l6]%asi,%f0           ! (4_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f44           ! (3_0) res = vis_for(res,DC1);

        fmuld   %f24,%f24,%f24          ! (2_0) dtmp0 = dexp_hi * dexp_hi;
        cmp     %g1,_0x7ff00000         ! (3_0) hx ? 0x7ff00000
        lda     [%l6+4]%asi,%f1         ! (4_0) ((float*)res)[1] = ((float*)px)[1];
        faddd   %f8,%f4,%f8             ! (3_1) res += dexp_hi;

        fmuld   %f62,%f32,%f60          ! (0_0) res *= xx;
        bge,pn  %icc,.update20          ! (3_0) if ( hx >= 0x7ff00000 )
        ldd     [%fp+tmp5],%f62         ! (3_1) dlexp = *(double*)lexp;
        fand    %f18,DC3,%f4            ! (2_0) res_c = vis_fand(res_c,DC3);
.cont20:
        fmuld   %f40,%f58,%f40          ! (5_1) res *= xx;
        cmp     %g1,_0x00100000         ! (3_0) hx ? 0x00100000
        and     %o2,2040,%o2            ! (3_0) hx &= 0x7f8;
        faddd   %f34,K3,%f10            ! (6_1) res += K3;

        fmuld   %f30,%f36,%f36          ! (4_1) res = dexp_hi * res;
        bl,pn   %icc,.update21          ! (3_0) if ( hx < 0x00100000 )
        sub     %g0,%o7,%o7             ! (3_0) iexp = -iexp;
        fand    %f0,DC0,%f16            ! (4_0) res = vis_fand(res,DC0);
.cont21:
        fmuld   %f8,%f62,%f8            ! (3_1) res *= dlexp;
        add     %o7,1534,%o7            ! (3_0) iexp += 0x5fe;
        ldd     [%l1+8],%f34            ! (4_1) dexp_lo = ((double*)addr)[1];
        fpadd32 %f44,DC2,%f18           ! (3_0) res_c = vis_fpadd32(res,DC2);

        fmuld   K6,%f38,%f62            ! (1_0) res = K6 * xx;
        add     %o2,8,%i5               ! (3_0) hx += 8;
        st      %f8,[%i0]               ! (3_1) ((float*)py)[0] = ((float*)res)[0];
        fsubd   %f28,%f4,%f28           ! (2_0) xx = res - res_c;

        fmuld   %f10,%f26,%f4           ! (6_1) res *= xx;
        nop
        st      %f9,[%i0+4]             ! (3_1) ((float*)py)[1] = ((float*)res)[1];
        faddd   %f60,K4,%f60            ! (0_0) res += K4;

        lda     [%l6]%asi,%g1           ! (4_0) hx = *(int*)px;
        sllx    %o7,52,%o7              ! (3_0) iexp << 52;
        and     %i5,-16,%i5             ! (3_0) hx = -16;
        faddd   %f40,K1,%f40            ! (5_1) res += K1;

        add     %l6,stridex,%i0         ! px += stridex
        add     %i5,TBL,%i5             ! (3_0) addr = (char*)arr + hx;
        stx     %o7,[%fp+tmp5]          ! (3_0) dlexp = *(double*)lexp;
        faddd   %f36,%f34,%f8           ! (4_1) res += dexp_lo;

        fmuld   %f28,%f24,%f36          ! (2_0) xx *= dtmp0;
        add     %i1,stridey,%l6         ! px += stridey
        ldd     [%i5],%f28              ! (3_0) dtmp0 = ((double*)addr)[0];
        faddd   %f62,K5,%f62            ! (1_0) res += K5;

        faddd   %f4,K2,%f10             ! (6_1) res += K2;
        sra     %g1,10,%o2              ! (4_0) hx >>= 10;
        nop
        fmuld   %f60,%f32,%f34          ! (0_0) res *= xx;

        fmuld   %f40,%f58,%f40          ! (5_1) res *= xx;
        sra     %g1,21,%o7              ! (4_0) iexp = hx >> 21;
        lda     [%i0]%asi,%f6           ! (5_0) ((float*)res)[0] = ((float*)px)[0];
        for     %f16,DC1,%f24           ! (4_0) res = vis_for(res,DC1);

        fmuld   %f28,%f28,%f28          ! (3_0) dtmp0 = dexp_hi * dexp_hi;
        cmp     %g1,_0x7ff00000         ! (4_0) hx ? 0x7ff00000
        lda     [%i0+4]%asi,%f7         ! (5_0) ((float*)res)[1] = ((float*)px)[1];
        faddd   %f8,%f30,%f30           ! (4_1) res += dexp_hi;

        fand    %f18,DC3,%f8            ! (3_0) res_c = vis_fand(res_c,DC3);
        bge,pn  %icc,.update22          ! (4_0) if ( hx >= 0x7ff00000 )
        ldd     [%fp+tmp6],%f18         ! (4_1) dlexp = *(double*)lexp;
        fmuld   %f62,%f38,%f62          ! (1_0) res *= xx;
.cont22:
        fmuld   %f10,%f26,%f58          ! (6_1) res *= xx;
        cmp     %g1,_0x00100000         ! (4_0) hx ? 0x00100000
        and     %o2,2040,%o2            ! (4_0) hx &= 0x7f8;
        faddd   %f34,K3,%f60            ! (0_0) res += K3;

        fmuld   %f22,%f40,%f40          ! (5_1) res = dexp_hi * res;
        bl,pn   %icc,.update23          ! (4_0) if ( hx < 0x00100000 )
        sub     %g0,%o7,%o7             ! (4_0) iexp = -iexp;
        fand    %f6,DC0,%f16            ! (5_0) res = vis_fand(res,DC0);
.cont23:
        fmuld   %f30,%f18,%f6           ! (4_1) res *= dlexp;
        add     %o7,1534,%o7            ! (4_0) iexp += 0x5fe;
        ldd     [%i3+8],%f34            ! (5_1) dexp_lo = ((double*)addr)[1];
        fpadd32 %f24,DC2,%f18           ! (4_0) res_c = vis_fpadd32(res,DC2);

        fmuld   K6,%f36,%f30            ! (2_0) res = K6 * xx;
        add     %o2,8,%l1               ! (4_0) hx += 8;
        st      %f6,[%i1]               ! (4_1) ((float*)py)[0] = ((float*)res)[0];
        fsubd   %f44,%f8,%f44           ! (3_0) xx = res - res_c;

        fmuld   %f60,%f32,%f60          ! (0_0) res *= xx;
        sllx    %o7,52,%o7              ! (4_0) iexp << 52;
        st      %f7,[%i1+4]             ! (4_1) ((float*)py)[1] = ((float*)res)[1];
        faddd   %f62,K4,%f6             ! (1_0) res += K4;

        lda     [%i0]%asi,%g1           ! (5_0) hx = *(int*)px;
        add     %i0,stridex,%i1         ! px += stridex
        and     %l1,-16,%l1             ! (4_0) hx = -16;
        faddd   %f58,K1,%f58            ! (6_1) res += K1;

        add     %l1,TBL,%l1             ! (4_0) addr = (char*)arr + hx;
        add     %l6,stridey,%i0         ! px += stridey
        stx     %o7,[%fp+tmp6]          ! (4_0) dlexp = *(double*)lexp;
        faddd   %f40,%f34,%f8           ! (5_1) res += dexp_lo;

        fmuld   %f44,%f28,%f40          ! (3_0) xx *= dtmp0;
        nop
        ldd     [%l1],%f44              ! (4_0) dtmp0 = ((double*)addr)[0];
        faddd   %f30,K5,%f62            ! (2_0) res += K5;

        fmuld   %f6,%f38,%f34           ! (1_0) res *= xx;
        sra     %g1,21,%o7              ! (5_0) iexp = hx >> 21;
        ldd     [%l4],%f30              ! (6_1) dexp_hi = ((double*)addr)[0];
        faddd   %f60,K2,%f60            ! (0_0) res += K2;

        for     %f16,DC1,%f28           ! (5_0) res = vis_for(res,DC1);
        sub     %g0,%o7,%o7             ! (5_0) iexp = -iexp;
        lda     [%i1]%asi,%f6           ! (6_0) ((float*)res)[0] = ((float*)px)[0];
        fmuld   %f58,%f26,%f26          ! (6_1) res *= xx;

        fmuld   %f44,%f44,%f44          ! (4_0) dtmp0 = dexp_hi * dexp_hi;
        cmp     %g1,_0x7ff00000         ! (5_0) hx ? 0x7ff00000
        lda     [%i1+4]%asi,%f7         ! (6_0) ((float*)res)[1] = ((float*)px)[1];
        faddd   %f8,%f22,%f22           ! (5_1) res += dexp_hi;

        fand    %f18,DC3,%f8            ! (4_0) res_c = vis_fand(res_c,DC3);
        bge,pn  %icc,.update24          ! (5_0) if ( hx >= 0x7ff00000 )
        ldd     [%fp+tmp0],%f18         ! (5_1) dlexp = *(double*)lexp;
        fmuld   %f62,%f36,%f62          ! (2_0) res *= xx;
.cont24:
        fmuld   %f60,%f32,%f58          ! (0_0) res *= xx;
        sra     %g1,10,%o2              ! (5_0) hx >>= 10;
        cmp     %g1,_0x00100000         ! (5_0) hx ? 0x00100000
        faddd   %f34,K3,%f34            ! (1_0) res += K3;

        fmuld   %f30,%f26,%f26          ! (6_1) res = dexp_hi * res;
        bl,pn   %icc,.update25          ! (5_0) if ( hx < 0x00100000 )
        and     %o2,2040,%o2            ! (5_0) hx &= 0x7f8;
        fand    %f6,DC0,%f16            ! (6_0) res = vis_fand(res,DC0);
.cont25:
        fmuld   %f22,%f18,%f2           ! (5_1) res *= dlexp;
        subcc   counter,7,counter       ! counter -= 7;
        ldd     [%l4+8],%f60            ! (6_1) dexp_lo = ((double*)addr)[1];
        fpadd32 %f28,DC2,%f18           ! (5_0) res_c = vis_fpadd32(res,DC2);

        fmuld   K6,%f40,%f22            ! (3_0) res = K6 * xx;
        add     %o2,8,%i3               ! (5_0) hx += 8;
        st      %f2,[%l6]               ! (5_1) ((float*)py)[0] = ((float*)res)[0];
        fsubd   %f24,%f8,%f10           ! (4_0) xx = res - res_c;

        fmuld   %f34,%f38,%f24          ! (1_0) res *= xx;
        st      %f3,[%l6+4]             ! (5_1) ((float*)py)[1] = ((float*)res)[1];
        bpos,pt %icc,.main_loop
        faddd   %f62,K4,%f34            ! (2_0) res += K4;

        add     counter,7,counter
! .tail: drain of the software pipeline.
! Reached by fall-through from the main loop (after counter has been
! re-biased by +7 just above).  The code is a chain of sections; each one
!   1. decrements counter and, if it went negative, leaves for .begin with
!      %o4 = address of the next output element (the mov sits in an annulled
!      delay slot, so it executes only when the branch is taken);
!   2. otherwise advances the polynomial / table-lookup work of the elements
!      still in flight and stores one finished result as two 32-bit words.
! The (e_s) tags in the comments are the element/stage labels used
! throughout the pipelined code.
.tail:
        add     %o7,1534,%o7            ! (5_0) iexp += 0x5fe;
        subcc   counter,1,counter
        bneg,a  .begin                  ! nothing left to drain
        mov     %i0,%o4                 ! (annulled slot) py for .begin

        faddd   %f58,K1,%f58            ! (0_1) res += K1;

        faddd   %f26,%f60,%f8           ! (6_2) res += dexp_lo;

        faddd   %f22,K5,%f62            ! (3_1) res += K5;
        fmuld   %f10,%f44,%f60          ! (4_1) xx *= dtmp0;

        faddd   %f24,K2,%f26            ! (1_1) res += K2;
        add     %i1,stridex,%l6         ! px += stridex
        ldd     [%l2],%f24              ! (0_1) dexp_hi = ((double*)addr)[0];
        fmuld   %f34,%f36,%f34          ! (2_1) res *= xx;

        fmuld   %f58,%f32,%f58          ! (0_1) res *= xx;

        add     %i0,stridey,%i1         ! py += stridey
        faddd   %f8,%f30,%f30           ! (6_2) res += dexp_hi;

        fmuld   %f62,%f40,%f32          ! (3_1) res *= xx;
        ldd     [%fp+tmp1],%f62         ! (6_2) dlexp = *(double*)lexp;

        fmuld   %f26,%f38,%f26          ! (1_1) res *= xx;
        faddd   %f34,K3,%f34            ! (2_1) res += K3;

        fmuld   %f24,%f58,%f58          ! (0_1) res = dexp_hi * res;

        fmuld   %f30,%f62,%f2           ! (6_2) res *= dlexp;
        ldd     [%l2+8],%f30            ! (0_1) dexp_lo = ((double*)addr)[1];

        fmuld   K6,%f60,%f62            ! (4_1) res = K6 * xx;
        st      %f2,[%i0]               ! (6_2) ((float*)py)[0] = ((float*)res)[0];

        fmuld   %f34,%f36,%f28          ! (2_1) res *= xx;
        st      %f3,[%i0+4]             ! (6_2) ((float*)py)[1] = ((float*)res)[1];
        faddd   %f32,K4,%f32            ! (3_1) res += K4;

        ! ---- next drain step: finishes and stores element (0_1) ----
        subcc   counter,1,counter
        bneg,a  .begin
        mov     %i1,%o4                 ! (annulled slot) py for .begin

        faddd   %f26,K1,%f26            ! (1_1) res += K1;

        faddd   %f58,%f30,%f8           ! (0_1) res += dexp_lo;

        add     %l6,stridex,%l6         ! px += stridex
        faddd   %f62,K5,%f62            ! (4_1) res += K5;

        fmuld   %f32,%f40,%f34          ! (3_1) res *= xx;
        add     %i1,stridey,%i0         ! py += stridey
        ldd     [%i2],%f22              ! (1_1) dexp_hi = ((double*)addr)[0];
        faddd   %f28,K2,%f32            ! (2_1) res += K2;

        fmuld   %f26,%f38,%f26          ! (1_1) res *= xx;

        faddd   %f8,%f24,%f24           ! (0_1) res += dexp_hi;

        fmuld   %f62,%f60,%f38          ! (4_1) res *= xx;
        ldd     [%fp+tmp2],%f62         ! (0_1) dlexp = *(double*)lexp;

        fmuld   %f32,%f36,%f32          ! (2_1) res *= xx;
        faddd   %f34,K3,%f34            ! (3_1) res += K3;

        fmuld   %f22,%f26,%f26          ! (1_1) res = dexp_hi * res;

        fmuld   %f24,%f62,%f2           ! (0_1) res *= dlexp;
        ldd     [%i2+8],%f24            ! (1_1) dexp_lo = ((double*)addr)[1];

        st      %f2,[%i1]               ! (0_1) ((float*)py)[0] = ((float*)res)[0];

        fmuld   %f34,%f40,%f44          ! (3_1) res *= xx;
        st      %f3,[%i1+4]             ! (0_1) ((float*)py)[1] = ((float*)res)[1];
        faddd   %f38,K4,%f38            ! (4_1) res += K4;

        ! ---- next drain step: finishes and stores element (1_1) ----
        subcc   counter,1,counter
        bneg,a  .begin
        mov     %i0,%o4                 ! (annulled slot) py for .begin

        faddd   %f32,K1,%f32            ! (2_1) res += K1;

        add     %l6,stridex,%l6         ! px += stridex
        faddd   %f26,%f24,%f8           ! (1_1) res += dexp_lo;

        add     %i0,stridey,%i1         ! py += stridey

        fmuld   %f38,%f60,%f34          ! (4_1) res *= xx;
        ldd     [%i4],%f24              ! (2_1) dexp_hi = ((double*)addr)[0];
        faddd   %f44,K2,%f38            ! (3_1) res += K2;

        fmuld   %f32,%f36,%f32          ! (2_1) res *= xx;

        faddd   %f8,%f22,%f22           ! (1_1) res += dexp_hi;

        ldd     [%fp+tmp3],%f62         ! (1_1) dlexp = *(double*)lexp;

        fmuld   %f38,%f40,%f38          ! (3_1) res *= xx;
        faddd   %f34,K3,%f34            ! (4_1) res += K3;

        fmuld   %f24,%f32,%f32          ! (2_1) res = dexp_hi * res;

        fmuld   %f22,%f62,%f2           ! (1_1) res *= dlexp;
        ldd     [%i4+8],%f22            ! (2_1) dexp_lo = ((double*)addr)[1];

        st      %f2,[%i0]               ! (1_1) ((float*)py)[0] = ((float*)res)[0];

        fmuld   %f34,%f60,%f28          ! (4_1) res *= xx;
        st      %f3,[%i0+4]             ! (1_1) ((float*)py)[1] = ((float*)res)[1];

        ! ---- next drain step: finishes and stores element (2_1) ----
        subcc   counter,1,counter
        bneg,a  .begin
        mov     %i1,%o4                 ! (annulled slot) py for .begin

        faddd   %f38,K1,%f38            ! (3_1) res += K1;

        faddd   %f32,%f22,%f8           ! (2_1) res += dexp_lo;

        add     %l6,stridex,%l6         ! px += stridex

        add     %i1,stridey,%i0         ! py += stridey
        ldd     [%i5],%f22              ! (3_1) dexp_hi = ((double*)addr)[0];
        faddd   %f28,K2,%f36            ! (4_1) res += K2;

        fmuld   %f38,%f40,%f38          ! (3_1) res *= xx;

        faddd   %f8,%f24,%f24           ! (2_1) res += dexp_hi;

        ldd     [%fp+tmp4],%f62         ! (2_1) dlexp = *(double*)lexp;

        fmuld   %f36,%f60,%f36          ! (4_1) res *= xx;

        fmuld   %f22,%f38,%f38          ! (3_1) res = dexp_hi * res;

        fmuld   %f24,%f62,%f2           ! (2_1) res *= dlexp;
        ldd     [%i5+8],%f24            ! (3_1) dexp_lo = ((double*)addr)[1];

        st      %f2,[%i1]               ! (2_1) ((float*)py)[0] = ((float*)res)[0];

        st      %f3,[%i1+4]             ! (2_1) ((float*)py)[1] = ((float*)res)[1];

        ! ---- next drain step: finishes and stores element (3_1) ----
        subcc   counter,1,counter
        bneg,a  .begin
        mov     %i0,%o4                 ! (annulled slot) py for .begin

        faddd   %f36,K1,%f36            ! (4_1) res += K1;

        faddd   %f38,%f24,%f8           ! (3_1) res += dexp_lo;

        add     %i0,stridey,%i1         ! py += stridey

        add     %l6,stridex,%l6         ! px += stridex
        ldd     [%l1],%f30              ! (4_1) dexp_hi = ((double*)addr)[0];

        fmuld   %f36,%f60,%f36          ! (4_1) res *= xx;

        faddd   %f8,%f22,%f8            ! (3_1) res += dexp_hi;

        ldd     [%fp+tmp5],%f62         ! (3_1) dlexp = *(double*)lexp;

        fmuld   %f30,%f36,%f36          ! (4_1) res = dexp_hi * res;

        fmuld   %f8,%f62,%f8            ! (3_1) res *= dlexp;
        ldd     [%l1+8],%f34            ! (4_1) dexp_lo = ((double*)addr)[1];

        st      %f8,[%i0]               ! (3_1) ((float*)py)[0] = ((float*)res)[0];

        st      %f9,[%i0+4]             ! (3_1) ((float*)py)[1] = ((float*)res)[1];

        ! ---- last drain step: finishes and stores element (4_1) ----
        subcc   counter,1,counter
        bneg,a  .begin
        mov     %i1,%o4                 ! (annulled slot) py for .begin

        faddd   %f36,%f34,%f8           ! (4_1) res += dexp_lo;

        add     %l6,stridex,%i0         ! px += stridex

        add     %i1,stridey,%l6         ! py += stridey

        faddd   %f8,%f30,%f30           ! (4_1) res += dexp_hi;

        ldd     [%fp+tmp6],%f18         ! (4_1) dlexp = *(double*)lexp;

        fmuld   %f30,%f18,%f6           ! (4_1) res *= dlexp;

        st      %f6,[%i1]               ! (4_1) ((float*)py)[0] = ((float*)res)[0];

        st      %f7,[%i1+4]             ! (4_1) ((float*)py)[1] = ((float*)res)[1];

        ba      .begin                  ! pipeline fully drained
        add     %i1,stridey,%o4         ! (delay slot) %o4 = py past the last store

        .align  16
! .spec0: scalar special-case path that produces the result as DONE / x
! for the element held in %f0, stores it to [%o4], advances both pointers,
! consumes one element and re-enters at .begin1.
! NOTE(review): the branch that reaches this label is outside this view;
! presumably it is taken for Inf/NaN inputs -- confirm at the call site.
.spec0:
        fdivd   DONE,%f0,%f0            ! res = DONE / res;
        add     %i1,stridex,%i1         ! px += stridex
        st      %f0,[%o4]               ! ((float*)py)[0] = ((float*)&res)[0];
        st      %f1,[%o4+4]             ! ((float*)py)[1] = ((float*)&res)[1];
        add     %o4,stridey,%o4         ! py += stridey
        ba      .begin1
        sub     counter,1,counter       ! (delay slot) one element consumed

        .align  16
! .spec1: scalar special-case path with three outcomes.
!   * (%i2 | %l4) == 0      : res = DONE / res, store, next element (label 2)
!   * %g1 (hx) < 0          : res = sqrt(res),  store, next element (label 2)
!   * otherwise             : rescale the value through an integer-to-double
!                             conversion and rejoin the fast path at
!                             .cont_spec with hx, iexp and res rebuilt.
! Both fdivd and fsqrtd sit in annulled delay slots, so each executes only
! when its branch is taken.
! NOTE(review): the setup of %i2, %l4 and %i4 is outside this view;
! presumably they hold the two words of x and the 0x00080000 threshold used
! by the .updateN handlers -- confirm at the call site.
.spec1:
        orcc    %i2,%l4,%g0
        bz,a    2f
        fdivd   DONE,%f0,%f0            ! res = DONE / res;

        cmp     %g1,0
        bl,a    2f
        fsqrtd  %f0,%f0                 ! res = sqrt(res);

        cmp     %g1,%i4
        bge,a   1f                      ! hx >= %i4: take the masked path
        ldd     [%o3+0x50],%f18         ! (annulled slot) mask used as DC4 below

        ! hx < %i4: convert the raw 64-bit pattern directly.
        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        st      %f0,[%fp+tmp0]          ! spill high word so it can be read as hx

        fand    %f0,DC0,%f16            ! (6_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp0],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (6_1) iexp = hx >> 21;
        for     %f16,DC1,%f44           ! (6_1) res = vis_for(res,DC1);

        sra     %g1,10,%o2              ! (6_1) hx >>= 10;
        sub     %o7,537,%o7             ! iexp -= 537 (537 = 1074/2; presumably
                                        ! compensates the scaling of fxtod)

        and     %o2,2040,%o2            ! (6_1) hx &= 0x7f8;
        ba      .cont_spec
        sub     %g0,%o7,%o7             ! (6_1) iexp = -iexp;

1:
        fand    %f0,%f18,%f0            ! res = vis_fand(res,DC4);

        ldd     [%o3+0x58],%f28         ! constant added below as D2ON51
        fxtod   %f0,%f0                 ! res = *(long long*)&res;

        faddd   %f0,%f28,%f0            ! res += D2ON51;
        st      %f0,[%fp+tmp0]          ! spill high word so it can be read as hx

        fand    %f0,DC0,%f16            ! (6_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp0],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (6_1) iexp = hx >> 21;
        for     %f16,DC1,%f44           ! (6_1) res = vis_for(res,DC1);

        sra     %g1,10,%o2              ! (6_1) hx >>= 10;
        sub     %o7,537,%o7             ! iexp -= 537 (same bias as above)

        and     %o2,2040,%o2            ! (6_1) hx &= 0x7f8;
        ba      .cont_spec
        sub     %g0,%o7,%o7             ! (6_1) iexp = -iexp;

2:
        ! Common exit for the fdivd / fsqrtd outcomes: store and move on.
        add     %i1,stridex,%i1         ! px += stridex
        st      %f0,[%o4]               ! ((float*)py)[0] = ((float*)&res)[0];
        st      %f1,[%o4+4]             ! ((float*)py)[1] = ((float*)&res)[1];
        add     %o4,stridey,%o4         ! py += stridey
        ba      .begin1
        sub     counter,1,counter       ! (delay slot) one element consumed

        .align  16
! .update0: out-of-line fix-up that truncates the current pipelined run.
! If at most 1 element remains, nothing is changed.  Otherwise the resume
! state is recorded (tmp_px = %l6 - stridex, tmp_counter = counter - 1) and
! counter is clamped to 1.  Control always returns to .cont0.
! NOTE(review): the branch site is outside this view; by analogy with
! .update18/.update20 in the main loop this is presumably the
! hx >= 0x7ff00000 case -- confirm.
.update0:
        cmp     counter,1
        ble     .cont0                  ! counter <= 1: leave state untouched
        nop

        sub     %l6,stridex,tmp_px      ! remember px of the deferred element
        sub     counter,1,tmp_counter   ! elements left after this run

        ba      .cont0
        mov     1,counter               ! (delay slot) clamp the current run

        .align  16
! .update1: out-of-line fix-up for an element whose hx (%g1) needs special
! treatment; always returns to .cont1.
!   * counter <= 1           : return unchanged (only %i1 is written).
!   * hx < 0 or (hx|lx) == 0 : label 1 -- record resume state in
!                              tmp_px / tmp_counter and clamp counter to 1.
!   * hx >= 0x00080000       : label 2 -- mask with the constant at
!                              [%o3+0x50], fxtod, add the constant at
!                              [%o3+0x58], then rebuild hx / iexp / res.
!   * otherwise              : fxtod directly, then rebuild hx / iexp / res.
! NOTE(review): the branch site is outside this view; by analogy with
! .update19/.update21 this is presumably the hx < 0x00100000 case, and the
! two constants are presumably DC4 and D2ON51 as named in .spec1 -- confirm.
.update1:
        cmp     counter,1
        ble     .cont1
        sub     %l6,stridex,%i1         ! (delay slot, runs on both paths) %i1 = px - stridex

        ld      [%i1+4],%i2             ! second 32-bit word of x (lx)
        cmp     %g1,0
        bl      1f                      ! hx < 0: defer this element

        orcc    %g1,%i2,%g0             ! (delay slot of bl) test hx | lx
        bz      1f                      ! all bits clear: defer this element
        sethi   %hi(0x00080000),%i3     ! (delay slot) threshold 0x00080000

        cmp     %g1,%i3
        bge,a   2f
        ldd     [%o3+0x50],%f18         ! (annulled slot) mask for label 2

        fxtod   %f8,%f8                 ! res = *(long long*)&res;
        st      %f8,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f8,DC0,%f16            ! (0_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (0_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (0_0) hx >>= 10;
        for     %f16,DC1,%f28           ! (0_0) res = vis_for(res,DC1);

        sub     %o7,537,%o7             ! iexp -= 537 (537 = 1074/2; presumably
                                        ! compensates the scaling of fxtod)

        sub     %g0,%o7,%o7             ! (0_0) iexp = -iexp;

        and     %o2,2040,%o2            ! (0_0) hx &= 0x7f8;
        ba      .cont1
        add     %o7,1534,%o7            ! (0_0) iexp += 0x5fe;
2:
        fand    %f8,%f18,%f8            ! apply the mask loaded in the annulled slot
        fxtod   %f8,%f8                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f18
        faddd   %f8,%f18,%f8            ! add the constant loaded from [%o3+0x58]
        st      %f8,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f8,DC0,%f16            ! (0_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (0_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (0_0) hx >>= 10;
        for     %f16,DC1,%f28           ! (0_0) res = vis_for(res,DC1);

        sub     %o7,537,%o7             ! iexp -= 537 (same bias as above)

        sub     %g0,%o7,%o7             ! (0_0) iexp = -iexp;

        and     %o2,2040,%o2            ! (0_0) hx &= 0x7f8;
        ba      .cont1
        add     %o7,1534,%o7            ! (0_0) iexp += 0x5fe;
1:
        sub     %l6,stridex,tmp_px      ! remember px of the deferred element
        sub     counter,1,tmp_counter   ! elements left after this run

        ba      .cont1
        mov     1,counter               ! (delay slot) clamp the current run

        .align  16
! .update2: out-of-line fix-up that truncates the current pipelined run.
! If at most 2 elements remain, nothing is changed.  Otherwise the resume
! state is recorded (tmp_px = %l6 - stridex, tmp_counter = counter - 2) and
! counter is clamped to 2.  Control always returns to .cont2.
! NOTE(review): the branch site is outside this view; presumably the
! hx >= 0x7ff00000 case, as for .update18/.update20 -- confirm.
.update2:
        cmp     counter,2
        ble     .cont2                  ! counter <= 2: leave state untouched
        nop

        sub     %l6,stridex,tmp_px      ! remember px of the deferred element
        sub     counter,2,tmp_counter   ! elements left after this run

        ba      .cont2
        mov     2,counter               ! (delay slot) clamp the current run

        .align  16
! .update3: out-of-line fix-up for an element whose hx (%g1) needs special
! treatment; always returns to .cont3.
!   * counter <= 2           : return unchanged (only %i1 is written).
!   * hx < 0 or (hx|lx) == 0 : label 1 -- record resume state in
!                              tmp_px / tmp_counter and clamp counter to 2.
!   * hx >= 0x00080000       : label 2 -- mask, fxtod, add constant, rebuild.
!   * otherwise              : fxtod directly, then rebuild.
! Unlike .update1 the negate / +0x5fe steps are not done here, so .cont3
! is expected to perform them.
! NOTE(review): branch site outside this view; presumably the
! hx < 0x00100000 case as for .update19/.update21, with [%o3+0x50] and
! [%o3+0x58] presumably DC4 and D2ON51 as named in .spec1 -- confirm.
.update3:
        cmp     counter,2
        ble     .cont3
        sub     %l6,stridex,%i1         ! (delay slot, runs on both paths) %i1 = px - stridex

        ld      [%i1+4],%i2             ! second 32-bit word of x (lx)
        cmp     %g1,0
        bl      1f                      ! hx < 0: defer this element

        orcc    %g1,%i2,%g0             ! (delay slot of bl) test hx | lx
        bz      1f                      ! all bits clear: defer this element
        sethi   %hi(0x00080000),%i3     ! (delay slot) threshold 0x00080000

        cmp     %g1,%i3
        bge,a   2f
        ldd     [%o3+0x50],%f18         ! (annulled slot) mask for label 2

        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        st      %f0,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f0,DC0,%f16            ! (1_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (1_0) iexp = hx >> 21;
        for     %f16,DC1,%f44           ! (1_0) res = vis_for(res,DC1);

        sra     %g1,10,%o2              ! (1_0) hx >>= 10;
        sub     %o7,537,%o7             ! iexp -= 537 (537 = 1074/2; presumably
                                        ! compensates the scaling of fxtod)
        ba      .cont3
        and     %o2,2040,%o2            ! (1_0) hx &= 0x7f8;
2:
        fand    %f0,%f18,%f0            ! apply the mask loaded in the annulled slot
        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f18
        faddd   %f0,%f18,%f0            ! add the constant loaded from [%o3+0x58]
        st      %f0,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f0,DC0,%f16            ! (1_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (1_0) iexp = hx >> 21;
        for     %f16,DC1,%f44           ! (1_0) res = vis_for(res,DC1);

        sra     %g1,10,%o2              ! (1_0) hx >>= 10;
        sub     %o7,537,%o7             ! iexp -= 537 (same bias as above)
        ba      .cont3
        and     %o2,2040,%o2            ! (1_0) hx &= 0x7f8;
1:
        sub     %l6,stridex,tmp_px      ! remember px of the deferred element
        sub     counter,2,tmp_counter   ! elements left after this run

        ba      .cont3
        mov     2,counter               ! (delay slot) clamp the current run

        .align  16
! .update4: out-of-line fix-up that truncates the current pipelined run.
! If at most 3 elements remain, nothing is changed.  Otherwise the resume
! state is recorded (tmp_px = %l6 - stridex, tmp_counter = counter - 3) and
! counter is clamped to 3.  Control always returns to .cont4.
! NOTE(review): the branch site is outside this view; presumably the
! hx >= 0x7ff00000 case, as for .update18/.update20 -- confirm.
.update4:
        cmp     counter,3
        ble     .cont4                  ! counter <= 3: leave state untouched
        nop

        sub     %l6,stridex,tmp_px      ! remember px of the deferred element
        sub     counter,3,tmp_counter   ! elements left after this run

        ba      .cont4
        mov     3,counter               ! (delay slot) clamp the current run

        .align  16
! .update5: out-of-line fix-up for an element whose hx (%g1) needs special
! treatment; always returns to .cont5.
!   * counter <= 3           : return unchanged (only %i1 is written).
!   * hx < 0 or (hx|lx) == 0 : label 1 -- record resume state in
!                              tmp_px / tmp_counter and clamp counter to 3.
!   * hx >= 0x00080000       : label 2 -- mask, fxtod, add constant, rebuild.
!   * otherwise              : fxtod directly, then rebuild.
! NOTE(review): branch site outside this view; presumably the
! hx < 0x00100000 case as for .update19/.update21, with [%o3+0x50] and
! [%o3+0x58] presumably DC4 and D2ON51 as named in .spec1 -- confirm.
.update5:
        cmp     counter,3
        ble     .cont5
        sub     %l6,stridex,%i1         ! (delay slot, runs on both paths) %i1 = px - stridex

        ld      [%i1+4],%i3             ! second 32-bit word of x (lx)
        cmp     %g1,0
        bl      1f                      ! hx < 0: defer this element

        orcc    %g1,%i3,%g0             ! (delay slot of bl) test hx | lx
        bz      1f                      ! all bits clear: defer this element
        sethi   %hi(0x00080000),%i4     ! (delay slot) threshold 0x00080000

        cmp     %g1,%i4
        bge,a   2f
        ldd     [%o3+0x50],%f18         ! (annulled slot) mask for label 2

        fxtod   %f6,%f6                 ! res = *(long long*)&res;
        st      %f6,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f6,DC0,%f16            ! (2_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (2_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (2_0) hx >>= 10;

        sub     %o7,537,%o7             ! iexp -= 537 (537 = 1074/2; presumably
                                        ! compensates the scaling of fxtod)
        and     %o2,2040,%o2            ! (2_0) hx &= 0x7f8;
        ba      .cont5
        for     %f16,DC1,%f28           ! (2_0) res = vis_for(res,DC1);
2:
        fand    %f6,%f18,%f6            ! apply the mask loaded in the annulled slot
        fxtod   %f6,%f6                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f18
        faddd   %f6,%f18,%f6            ! add the constant loaded from [%o3+0x58]
        st      %f6,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f6,DC0,%f16            ! (2_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (2_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (2_0) hx >>= 10;

        sub     %o7,537,%o7             ! iexp -= 537 (same bias as above)
        and     %o2,2040,%o2            ! (2_0) hx &= 0x7f8;
        ba      .cont5
        for     %f16,DC1,%f28           ! (2_0) res = vis_for(res,DC1);
1:
        sub     %l6,stridex,tmp_px      ! remember px of the deferred element
        sub     counter,3,tmp_counter   ! elements left after this run

        ba      .cont5
        mov     3,counter               ! (delay slot) clamp the current run

        .align  16
! .update6: out-of-line fix-up that truncates the current pipelined run.
! If at most 4 elements remain, nothing is changed.  Otherwise the resume
! state is recorded (tmp_px = %l6 - stridex, tmp_counter = counter - 4) and
! counter is clamped to 4.  Control always returns to .cont6.
! NOTE(review): the branch site is outside this view; presumably the
! hx >= 0x7ff00000 case, as for .update18/.update20 -- confirm.
.update6:
        cmp     counter,4
        ble     .cont6                  ! counter <= 4: leave state untouched
        nop

        sub     %l6,stridex,tmp_px      ! remember px of the deferred element
        sub     counter,4,tmp_counter   ! elements left after this run

        ba      .cont6
        mov     4,counter               ! (delay slot) clamp the current run

        .align  16
! .update7: out-of-line fix-up for an element whose hx (%g1) needs special
! treatment; always returns to .cont7.
!   * counter <= 4           : return after the delay-slot faddd only.
!   * hx < 0 or (hx|lx) == 0 : label 1 -- record resume state in
!                              tmp_px / tmp_counter and clamp counter to 4.
!   * hx >= 0x00080000       : label 2 -- mask, fxtod, add constant, rebuild.
!   * otherwise              : fxtod directly, then rebuild.
! The faddd in the ble delay slot is pipeline work for stage (6_1) and runs
! on every path through this handler.
! NOTE(review): branch site outside this view; presumably the
! hx < 0x00100000 case as for .update19/.update21, with [%o3+0x50] and
! [%o3+0x58] presumably DC4 and D2ON51 as named in .spec1 -- confirm.
.update7:
        sub     %l6,stridex,%i1         ! %i1 = px - stridex
        cmp     counter,4
        ble     .cont7
        faddd   %f34,K3,%f6             ! (6_1) res += K3;

        ld      [%i1+4],%i3             ! second 32-bit word of x (lx)
        cmp     %g1,0
        bl      1f                      ! hx < 0: defer this element

        orcc    %g1,%i3,%g0             ! (delay slot of bl) test hx | lx
        bz      1f                      ! all bits clear: defer this element
        sethi   %hi(0x00080000),%i5     ! (delay slot) threshold 0x00080000

        cmp     %g1,%i5
        bge,a   2f
        ldd     [%o3+0x50],%f18         ! (annulled slot) mask for label 2

        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        st      %f0,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f0,DC0,%f16            ! (3_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (3_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (3_0) hx >>= 10;

        sub     %o7,537,%o7             ! iexp -= 537 (537 = 1074/2; presumably
                                        ! compensates the scaling of fxtod)
        and     %o2,2040,%o2            ! (3_0) hx &= 0x7f8;
        ba      .cont7
        for     %f16,DC1,%f44           ! (3_0) res = vis_for(res,DC1);
2:
        fand    %f0,%f18,%f0            ! apply the mask loaded in the annulled slot
        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f18
        faddd   %f0,%f18,%f0            ! add the constant loaded from [%o3+0x58]
        st      %f0,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f0,DC0,%f16            ! (3_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (3_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (3_0) hx >>= 10;

        sub     %o7,537,%o7             ! iexp -= 537 (same bias as above)
        and     %o2,2040,%o2            ! (3_0) hx &= 0x7f8;
        ba      .cont7
        for     %f16,DC1,%f44           ! (3_0) res = vis_for(res,DC1);
1:
        sub     %l6,stridex,tmp_px      ! remember px of the deferred element
        sub     counter,4,tmp_counter   ! elements left after this run

        ba      .cont7
        mov     4,counter               ! (delay slot) clamp the current run

        .align  16
! .update8: out-of-line fix-up that truncates the current pipelined run.
! If at most 5 elements remain, nothing is changed.  Otherwise the resume
! state is recorded (tmp_px = %l6, tmp_counter = counter - 5) and counter
! is clamped to 5.  Control always returns to .cont8.
! Here %l6 is used as is, without the stridex adjustment seen in
! .update0 - .update6.
! NOTE(review): the branch site is outside this view; presumably the
! hx >= 0x7ff00000 case, as for .update18/.update20 -- confirm.
.update8:
        cmp     counter,5
        ble     .cont8                  ! counter <= 5: leave state untouched
        nop

        mov     %l6,tmp_px              ! remember px of the deferred element
        sub     counter,5,tmp_counter   ! elements left after this run

        ba      .cont8
        mov     5,counter               ! (delay slot) clamp the current run

        .align  16
! .update9: out-of-line fix-up for an element whose hx (%g1) needs special
! treatment; always returns to .cont9.
!   * counter <= 5           : return after the delay-slot fand only.
!   * hx < 0 or (hx|lx) == 0 : label 1 -- record resume state in
!                              tmp_px / tmp_counter and clamp counter to 5.
!   * hx >= 0x00080000       : label 2 -- mask, fxtod, add constant, rebuild.
!   * otherwise              : fxtod directly, then rebuild.
! The fand in the ble delay slot is pipeline work for stage (5_0) and runs
! on every path through this handler.
! NOTE(review): branch site outside this view; presumably the
! hx < 0x00100000 case as for .update19/.update21, with [%o3+0x50] and
! [%o3+0x58] presumably DC4 and D2ON51 as named in .spec1 -- confirm.
.update9:
        ld      [%l6+4],%i3             ! second 32-bit word of x (lx)
        cmp     counter,5
        ble     .cont9
        fand    %f0,DC0,%f16            ! (5_0) res = vis_fand(res,DC0);

        cmp     %g1,0
        bl      1f                      ! hx < 0: defer this element

        orcc    %g1,%i3,%g0             ! (delay slot of bl) test hx | lx
        bz      1f                      ! all bits clear: defer this element
        sethi   %hi(0x00080000),%i1     ! (delay slot) threshold 0x00080000

        cmp     %g1,%i1
        bge,a   2f
        ldd     [%o3+0x50],%f18         ! (annulled slot) mask for label 2

        fxtod   %f8,%f8                 ! res = *(long long*)&res;
        st      %f8,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f8,DC0,%f24            ! (4_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (4_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (4_0) hx >>= 10;

        sub     %o7,537,%o7             ! iexp -= 537 (537 = 1074/2; presumably
                                        ! compensates the scaling of fxtod)

        and     %o2,2040,%o2            ! (4_0) hx &= 0x7f8;
        sub     %g0,%o7,%o7             ! (4_0) iexp = -iexp;
        ba      .cont9
        for     %f24,DC1,%f24           ! (4_0) res = vis_for(res,DC1);
2:
        fand    %f8,%f18,%f8            ! apply the mask loaded in the annulled slot
        fxtod   %f8,%f8                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f18
        faddd   %f8,%f18,%f8            ! add the constant loaded from [%o3+0x58]
        st      %f8,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f8,DC0,%f24            ! (4_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (4_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (4_0) hx >>= 10;

        sub     %o7,537,%o7             ! iexp -= 537 (same bias as above)

        and     %o2,2040,%o2            ! (4_0) hx &= 0x7f8;
        sub     %g0,%o7,%o7             ! (4_0) iexp = -iexp;
        ba      .cont9
        for     %f24,DC1,%f24           ! (4_0) res = vis_for(res,DC1);
1:
        mov     %l6,tmp_px              ! remember px of the deferred element
        sub     counter,5,tmp_counter   ! elements left after this run

        ba      .cont9
        mov     5,counter               ! (delay slot) clamp the current run

        .align  16
! .update10: out-of-line fix-up that truncates the current pipelined run.
! If at most 6 elements remain, nothing is changed.  Otherwise the resume
! state is recorded (tmp_px = %i0, tmp_counter = counter - 6) and counter
! is clamped to 6.  Control always returns to .cont10.
! NOTE(review): the branch site is outside this view; presumably the
! hx >= 0x7ff00000 case, as for .update18/.update20, with %i0 holding the
! x pointer at that point -- confirm.
.update10:
        cmp     counter,6
        ble     .cont10                 ! counter <= 6: leave state untouched
        nop

        mov     %i0,tmp_px              ! remember px of the deferred element
        sub     counter,6,tmp_counter   ! elements left after this run

        ba      .cont10
        mov     6,counter               ! (delay slot) clamp the current run

        .align  16
! .update11: out-of-line fix-up for an element whose hx (%g1) needs special
! treatment; always returns to .cont11.
!   * counter <= 6           : return after the delay-slot fand only.
!   * hx < 0 or (hx|lx) == 0 : label 1 -- record resume state in
!                              tmp_px / tmp_counter and clamp counter to 6.
!   * hx >= 0x00080000       : label 2 -- mask, fxtod, add constant, rebuild.
!   * otherwise              : fxtod directly, then rebuild.
! %i3 first holds lx and is then reused for the 0x00080000 threshold.
! The fand in the ble delay slot is pipeline work for stage (6_0) and runs
! on every path through this handler.
! NOTE(review): branch site outside this view; presumably the
! hx < 0x00100000 case as for .update19/.update21, with [%o3+0x50] and
! [%o3+0x58] presumably DC4 and D2ON51 as named in .spec1 -- confirm.
.update11:
        ld      [%i0+4],%i3             ! second 32-bit word of x (lx)
        cmp     counter,6
        ble     .cont11
        fand    %f6,DC0,%f16            ! (6_0) res = vis_fand(res,DC0);

        cmp     %g1,0
        bl      1f                      ! hx < 0: defer this element

        orcc    %g1,%i3,%g0             ! (delay slot of bl) test hx | lx
        bz      1f                      ! all bits clear: defer this element
        sethi   %hi(0x00080000),%i3     ! (delay slot) threshold 0x00080000

        cmp     %g1,%i3
        bge,a   2f
        ldd     [%o3+0x50],%f18         ! (annulled slot) mask for label 2

        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        st      %f0,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f0,DC0,%f28            ! (5_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (5_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (5_0) hx >>= 10;

        sub     %o7,537,%o7             ! iexp -= 537 (537 = 1074/2; presumably
                                        ! compensates the scaling of fxtod)

        sub     %g0,%o7,%o7             ! (5_0) iexp = -iexp;

        and     %o2,2040,%o2            ! (5_0) hx &= 0x7f8;
        ba      .cont11
        for     %f28,DC1,%f28           ! (5_0) res = vis_for(res,DC1);
2:
        fand    %f0,%f18,%f0            ! apply the mask loaded in the annulled slot
        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f18
        faddd   %f0,%f18,%f0            ! add the constant loaded from [%o3+0x58]
        st      %f0,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f0,DC0,%f28            ! (5_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (5_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (5_0) hx >>= 10;

        sub     %o7,537,%o7             ! iexp -= 537 (same bias as above)

        sub     %g0,%o7,%o7             ! (5_0) iexp = -iexp;

        and     %o2,2040,%o2            ! (5_0) hx &= 0x7f8;
        ba      .cont11
        for     %f28,DC1,%f28           ! (5_0) res = vis_for(res,DC1);
1:
        mov     %i0,tmp_px              ! remember px of the deferred element
        sub     counter,6,tmp_counter   ! elements left after this run

        ba      .cont11
        mov     6,counter               ! (delay slot) clamp the current run

        .align  16
! .update12: out-of-line fix-up that truncates the current pipelined run.
! If counter <= 0, only the delay-slot faddd runs.  Otherwise the resume
! state is recorded (tmp_px = %l6 - stridex, tmp_counter = counter) and
! counter is set to 0.  Control always returns to .cont12.
! The faddd in the ble delay slot is pipeline work for stage (2_1) and runs
! on every path through this handler.
! NOTE(review): the branch site is outside this view; presumably the
! hx >= 0x7ff00000 case, as for .update18/.update20 -- confirm.
.update12:
        cmp     counter,0
        ble     .cont12                 ! counter <= 0: leave state untouched
        faddd   %f34,K3,%f34            ! (2_1) res += K3;

        sub     %l6,stridex,tmp_px      ! remember px of the deferred element
        sub     counter,0,tmp_counter   ! all remaining elements are deferred

        ba      .cont12
        mov     0,counter               ! (delay slot) clamp the current run

        .align  16
! .update13: out-of-line fix-up for an element whose hx (%g1) needs special
! treatment; always returns to .cont13.
!   * counter <= 0           : return after the delay-slot fpadd32 only.
!   * hx < 0 or (hx|lx) == 0 : label 1 -- record resume state in
!                              tmp_px / tmp_counter and set counter to 0.
!   * hx >= 0x00080000       : label 2 -- mask, fxtod, add constant, rebuild.
!   * otherwise              : fxtod directly, then rebuild.
! %l4 is reused three times: element address, lx, then the threshold.
! Both rebuild paths recompute res_c (%f18) from the rebuilt res (%f44),
! replacing the value produced by the fpadd32 in the ble delay slot.
! NOTE(review): branch site outside this view; presumably the
! hx < 0x00100000 case as for .update19/.update21, with [%o3+0x50] and
! [%o3+0x58] presumably DC4 and D2ON51 as named in .spec1 -- confirm.
.update13:
        sub     %l6,stridex,%l4         ! %l4 = px - stridex
        cmp     counter,0
        ble     .cont13
        fpadd32 %f44,DC2,%f18           ! (6_1) res_c = vis_fpadd32(res,DC2);

        ld      [%l4+4],%l4             ! second 32-bit word of x (lx)
        cmp     %g1,0
        bl      1f                      ! hx < 0: defer this element

        orcc    %g1,%l4,%g0             ! (delay slot of bl) test hx | lx
        bz      1f                      ! all bits clear: defer this element
        sethi   %hi(0x00080000),%l4     ! (delay slot) threshold 0x00080000

        cmp     %g1,%l4
        bge,a   2f
        ldd     [%o3+0x50],%f62         ! (annulled slot) mask for label 2

        fxtod   %f6,%f6                 ! res = *(long long*)&res;
        st      %f6,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f6,DC0,%f44            ! (6_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (6_1) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (6_1) hx >>= 10;

        sub     %o7,537,%o7             ! iexp -= 537 (537 = 1074/2; presumably
                                        ! compensates the scaling of fxtod)
        and     %o2,2040,%o2            ! (6_1) hx &= 0x7f8;
        for     %f44,DC1,%f44           ! (6_1) res = vis_for(res,DC1);

        sub     %g0,%o7,%o7             ! (6_1) iexp = -iexp;
        ba      .cont13
        fpadd32 %f44,DC2,%f18           ! (6_1) res_c = vis_fpadd32(res,DC2);
2:
        fand    %f6,%f62,%f6            ! apply the mask loaded in the annulled slot
        fxtod   %f6,%f6                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f62
        faddd   %f6,%f62,%f6            ! add the constant loaded from [%o3+0x58]
        st      %f6,[%fp+tmp7]          ! spill high word so it can be read as hx

        fand    %f6,DC0,%f44            ! (6_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1          ! hx = high word of the converted value

        sra     %g1,21,%o7              ! (6_1) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (6_1) hx >>= 10;
        for     %f44,DC1,%f44           ! (6_1) res = vis_for(res,DC1);

        sub     %o7,537,%o7             ! iexp -= 537 (same bias as above)

        and     %o2,2040,%o2            ! (6_1) hx &= 0x7f8;
        sub     %g0,%o7,%o7             ! (6_1) iexp = -iexp;
        ba      .cont13
        fpadd32 %f44,DC2,%f18           ! (6_1) res_c = vis_fpadd32(res,DC2);
1:
        sub     %l6,stridex,tmp_px      ! remember px of the deferred element
        sub     counter,0,tmp_counter   ! all remaining elements are deferred

        ba      .cont13
        mov     0,counter               ! (delay slot) clamp the current run

        .align  16
/*
 * .update14: out-of-line fixup that branches back to .cont14.
 *
 * If counter <= 1, execution resumes at .cont14 with no bookkeeping.
 * Otherwise %l6 - stridex is saved in tmp_px, counter - 1 is saved in
 * tmp_counter, and counter is forced to 1 before resuming.
 *
 * The faddd sits in the delay slot of a non-annulled branch, so it runs
 * on both paths.
 *
 * NOTE(review): presumably this defers a special-case input to a later
 * pass of the main loop; the code that jumps here is outside this
 * chunk -- confirm.
 */
.update14:
        cmp     counter,1
        ble     .cont14
        faddd   %f34,K3,%f34            ! (3_1) res += K3;

        /* Record where to restart and how many elements are left. */
        sub     %l6,stridex,tmp_px
        sub     counter,1,tmp_counter

        ba      .cont14
        mov     1,counter

        .align  16
/*
 * .update15: out-of-line fixup that branches back to .cont15.
 *
 * %l2 is set to %l6 - stridex.  If counter <= 1 the code resumes at
 * .cont15 at once; the fpadd32 in the delay slot of that non-annulled
 * branch runs on both paths.  Otherwise the word at [%l2+4] is loaded and
 * %g1 (called hx by the comments below) selects one of three paths:
 *
 *   - %g1 < 0, or (%g1 | loaded word) == 0: go to 1:, which saves
 *     %l6 - stridex in tmp_px and counter - 1 in tmp_counter, then
 *     resumes with counter forced to 1.
 *   - %g1 < 0x00080000: convert %f0 with fxtod as it stands.
 *   - otherwise (2:): AND %f0 with the doubleword at [%o3+0x50], convert
 *     with fxtod, then add the doubleword at [%o3+0x58].
 *
 * Both conversion paths store the upper word of %f0 to tmp7, reload it
 * into %g1, and rebuild iexp (%o7), hx (%o2), res (%f28) and res_c (%f18)
 * before resuming at .cont15.  Unlike the sibling fixups visible in this
 * chunk, this one also adds 0x5fe to iexp after negating it.
 *
 * NOTE(review): this looks like the subnormal-input path, with 1:
 * deferring negative/zero inputs.  %o3 presumably addresses .CONST_TBL,
 * which would make offsets 0x50 and 0x58 the DC4 and D2ON51 entries --
 * confirm against the main loop, which is outside this chunk.
 */
.update15:
        sub     %l6,stridex,%l2
        cmp     counter,1
        ble     .cont15
        fpadd32 %f28,DC2,%f18           ! (0_0) res_c = vis_fpadd32(res,DC2);

        /*
         * bl has no nop after it: the orcc below is its delay slot and
         * also sets the condition codes tested by bz when bl falls through.
         */
        ld      [%l2+4],%l2
        cmp     %g1,0
        bl      1f

        orcc    %g1,%l2,%g0
        bz      1f
        sethi   %hi(0x00080000),%l2

        /* Annulled branch: the ldd runs only when the branch to 2: is taken. */
        cmp     %g1,%l2
        bge,a   2f
        ldd     [%o3+0x50],%f62

        /* %g1 < 0x00080000: convert the bit pattern directly. */
        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        st      %f0,[%fp+tmp7]

        fand    %f0,DC0,%f18            ! (0_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (0_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (0_0) hx >>= 10;

        sub     %o7,537,%o7
        for     %f18,DC1,%f28           ! (0_0) res = vis_for(res,DC1);

        sub     %g0,%o7,%o7             ! (0_0) iexp = -iexp;

        and     %o2,2040,%o2            ! (0_0) hx &= 0x7f8;
        add     %o7,1534,%o7            ! (0_0) iexp += 0x5fe;
        ba      .cont15
        fpadd32 %f28,DC2,%f18           ! (0_0) res_c = vis_fpadd32(res,DC2);
2:
        /* %g1 >= 0x00080000: mask, convert, then add the second constant. */
        fand    %f0,%f62,%f0
        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f62
        faddd   %f0,%f62,%f0
        st      %f0,[%fp+tmp7]

        fand    %f0,DC0,%f18            ! (0_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (0_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (0_0) hx >>= 10;
        for     %f18,DC1,%f28           ! (0_0) res = vis_for(res,DC1);

        sub     %o7,537,%o7

        sub     %g0,%o7,%o7             ! (0_0) iexp = -iexp;

        and     %o2,2040,%o2            ! (0_0) hx &= 0x7f8;
        add     %o7,1534,%o7            ! (0_0) iexp += 0x5fe;
        ba      .cont15
        fpadd32 %f28,DC2,%f18           ! (0_0) res_c = vis_fpadd32(res,DC2);
1:
        /* Record where to restart and how many elements are left. */
        sub     %l6,stridex,tmp_px
        sub     counter,1,tmp_counter

        ba      .cont15
        mov     1,counter

        .align  16
/*
 * .update16: out-of-line fixup that branches back to .cont16.
 *
 * If counter <= 2, execution resumes at .cont16 with no bookkeeping.
 * Otherwise %l6 - stridex is saved in tmp_px, counter - 2 is saved in
 * tmp_counter, and counter is forced to 2 before resuming.
 *
 * The fand sits in the delay slot of a non-annulled branch, so it runs
 * on both paths.
 *
 * NOTE(review): presumably this defers a special-case input to a later
 * pass of the main loop; the code that jumps here is outside this
 * chunk -- confirm.
 */
.update16:
        cmp     counter,2
        ble     .cont16
        fand    %f18,DC3,%f8            ! (0_0) res_c = vis_fand(res_c,DC3);

        /* Record where to restart and how many elements are left. */
        sub     %l6,stridex,tmp_px
        sub     counter,2,tmp_counter

        ba      .cont16
        mov     2,counter

        .align  16
/*
 * .update17: out-of-line fixup that branches back to .cont17.
 *
 * %i2 is set to %l6 - stridex.  If counter <= 2 the code resumes at
 * .cont17 at once; the fand in the delay slot of that non-annulled
 * branch runs on both paths.  Otherwise the word at [%i2+4] is loaded and
 * %g1 (called hx by the comments below) selects one of three paths:
 *
 *   - %g1 < 0, or (%g1 | loaded word) == 0: go to 1:, which saves
 *     %l6 - stridex in tmp_px and counter - 2 in tmp_counter, then
 *     resumes with counter forced to 2.
 *   - %g1 < 0x00080000: convert %f6 with fxtod as it stands.
 *   - otherwise (2:): AND %f6 with the doubleword at [%o3+0x50], convert
 *     with fxtod, then add the doubleword at [%o3+0x58].
 *
 * Both conversion paths store the upper word of %f6 to tmp7, reload it
 * into %g1, and rebuild iexp (%o7), hx (%o2) and res (%f44) before
 * resuming at .cont17.
 *
 * NOTE(review): this looks like the subnormal-input path, with 1:
 * deferring negative/zero inputs.  %o3 presumably addresses .CONST_TBL,
 * which would make offsets 0x50 and 0x58 the DC4 and D2ON51 entries --
 * confirm against the main loop, which is outside this chunk.
 */
.update17:
        sub     %l6,stridex,%i2
        cmp     counter,2
        ble     .cont17
        fand    %f0,DC0,%f16            ! (2_0) res = vis_fand(res,DC0);

        /*
         * bl has no nop after it: the orcc below is its delay slot and
         * also sets the condition codes tested by bz when bl falls through.
         */
        ld      [%i2+4],%i2
        cmp     %g1,0
        bl      1f

        orcc    %g1,%i2,%g0
        bz      1f
        sethi   %hi(0x00080000),%i2

        /* Annulled branch: the ldd runs only when the branch to 2: is taken. */
        cmp     %g1,%i2
        bge,a   2f
        ldd     [%o3+0x50],%f2

        /* %g1 < 0x00080000: convert the bit pattern directly. */
        fxtod   %f6,%f6                 ! res = *(long long*)&res;
        st      %f6,[%fp+tmp7]

        fand    %f6,DC0,%f44            ! (1_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (1_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (1_0) hx >>= 10;

        sub     %o7,537,%o7

        and     %o2,2040,%o2            ! (1_0) hx &= 0x7f8;
        sub     %g0,%o7,%o7             ! (1_0) iexp = -iexp;
        ba      .cont17
        for     %f44,DC1,%f44           ! (1_0) res = vis_for(res,DC1);
2:
        /* %g1 >= 0x00080000: mask, convert, then add the second constant. */
        fand    %f6,%f2,%f6
        fxtod   %f6,%f6                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f2
        faddd   %f6,%f2,%f6
        st      %f6,[%fp+tmp7]

        fand    %f6,DC0,%f44            ! (1_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (1_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (1_0) hx >>= 10;

        sub     %o7,537,%o7

        and     %o2,2040,%o2            ! (1_0) hx &= 0x7f8;
        sub     %g0,%o7,%o7             ! (1_0) iexp = -iexp;
        ba      .cont17
        for     %f44,DC1,%f44           ! (1_0) res = vis_for(res,DC1);
1:
        /* Record where to restart and how many elements are left. */
        sub     %l6,stridex,tmp_px
        sub     counter,2,tmp_counter

        ba      .cont17
        mov     2,counter

        .align  16
/*
 * .update18: out-of-line fixup that branches back to .cont18.
 *
 * If counter <= 3, execution resumes at .cont18 with no bookkeeping.
 * Otherwise %l6 - stridex is saved in tmp_px, counter - 3 is saved in
 * tmp_counter, and counter is forced to 3 before resuming.
 *
 * The fand sits in the delay slot of a non-annulled branch, so it runs
 * on both paths.
 *
 * NOTE(review): presumably this defers a special-case input to a later
 * pass of the main loop; the code that jumps here is outside this
 * chunk -- confirm.
 */
.update18:
        cmp     counter,3
        ble     .cont18
        fand    %f18,DC3,%f8            ! (1_0) res_c = vis_fand(res_c,DC3);

        /* Record where to restart and how many elements are left. */
        sub     %l6,stridex,tmp_px
        sub     counter,3,tmp_counter

        ba      .cont18
        mov     3,counter

        .align  16
/*
 * .update19: out-of-line fixup that branches back to .cont19.
 *
 * %i4 is set to %l6 - stridex.  If counter <= 3 the code resumes at
 * .cont19 at once; the fand in the delay slot of that non-annulled
 * branch runs on both paths.  Otherwise the word at [%i4+4] is loaded and
 * %g1 (called hx by the comments below) selects one of three paths:
 *
 *   - %g1 < 0, or (%g1 | loaded word) == 0: go to 1:, which saves
 *     %l6 - stridex in tmp_px and counter - 3 in tmp_counter, then
 *     resumes with counter forced to 3.
 *   - %g1 < 0x00080000: convert %f0 with fxtod as it stands.
 *   - otherwise (2:): AND %f0 with the doubleword at [%o3+0x50], convert
 *     with fxtod, then add the doubleword at [%o3+0x58].
 *
 * Both conversion paths store the upper word of %f0 to tmp7, reload it
 * into %g1, and rebuild iexp (%o7), hx (%o2) and res (%f28) before
 * resuming at .cont19.
 *
 * NOTE(review): this looks like the subnormal-input path, with 1:
 * deferring negative/zero inputs.  %o3 presumably addresses .CONST_TBL,
 * which would make offsets 0x50 and 0x58 the DC4 and D2ON51 entries --
 * confirm against the main loop, which is outside this chunk.
 */
.update19:
        sub     %l6,stridex,%i4
        cmp     counter,3
        ble     .cont19
        fand    %f6,DC0,%f16            ! (3_0) res = vis_fand(res,DC0);

        /*
         * bl has no nop after it: the orcc below is its delay slot and
         * also sets the condition codes tested by bz when bl falls through.
         */
        ld      [%i4+4],%i4
        cmp     %g1,0
        bl      1f

        orcc    %g1,%i4,%g0
        bz      1f
        sethi   %hi(0x00080000),%i4

        /* Annulled branch: the ldd runs only when the branch to 2: is taken. */
        cmp     %g1,%i4
        bge,a   2f
        ldd     [%o3+0x50],%f2

        /* %g1 < 0x00080000: convert the bit pattern directly. */
        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        st      %f0,[%fp+tmp7]

        fand    %f0,DC0,%f28            ! (2_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (2_0) iexp = hx >> 21;

        sra     %g1,10,%o2              ! (2_0) hx >>= 10;
        sub     %o7,537,%o7

        and     %o2,2040,%o2            ! (2_0) hx &= 0x7f8;
        sub     %g0,%o7,%o7             ! (2_0) iexp = -iexp;
        ba      .cont19
        for     %f28,DC1,%f28           ! (2_0) res = vis_for(res,DC1);
2:
        /* %g1 >= 0x00080000: mask, convert, then add the second constant. */
        fand    %f0,%f2,%f0
        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f2
        faddd   %f0,%f2,%f0
        st      %f0,[%fp+tmp7]

        fand    %f0,DC0,%f28            ! (2_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (2_0) iexp = hx >> 21;

        sra     %g1,10,%o2              ! (2_0) hx >>= 10;
        sub     %o7,537,%o7

        and     %o2,2040,%o2            ! (2_0) hx &= 0x7f8;
        sub     %g0,%o7,%o7             ! (2_0) iexp = -iexp;
        ba      .cont19
        for     %f28,DC1,%f28           ! (2_0) res = vis_for(res,DC1);
1:
        /* Record where to restart and how many elements are left. */
        sub     %l6,stridex,tmp_px
        sub     counter,3,tmp_counter

        ba      .cont19
        mov     3,counter

        .align  16
/*
 * .update20: out-of-line fixup that branches back to .cont20.
 *
 * If counter <= 4, execution resumes at .cont20 with no bookkeeping.
 * Otherwise %l6 - stridex is saved in tmp_px, counter - 4 is saved in
 * tmp_counter, and counter is forced to 4 before resuming.
 *
 * The fand sits in the delay slot of a non-annulled branch, so it runs
 * on both paths.
 *
 * NOTE(review): presumably this defers a special-case input to a later
 * pass of the main loop; the code that jumps here is outside this
 * chunk -- confirm.
 */
.update20:
        cmp     counter,4
        ble     .cont20
        fand    %f18,DC3,%f4            ! (2_0) res_c = vis_fand(res_c,DC3);

        /* Record where to restart and how many elements are left. */
        sub     %l6,stridex,tmp_px
        sub     counter,4,tmp_counter

        ba      .cont20
        mov     4,counter

        .align  16
/*
 * .update21: out-of-line fixup that branches back to .cont21.
 *
 * %i5 is set to %l6 - stridex.  If counter <= 4 the code resumes at
 * .cont21 at once; the fand in the delay slot of that non-annulled
 * branch runs on both paths.  Otherwise the word at [%i5+4] is loaded and
 * %g1 (called hx by the comments below) selects one of three paths:
 *
 *   - %g1 < 0, or (%g1 | loaded word) == 0: go to 1:, which saves
 *     %l6 - stridex in tmp_px and counter - 4 in tmp_counter, then
 *     resumes with counter forced to 4.
 *   - %g1 < 0x00080000: convert %f6 with fxtod as it stands.
 *   - otherwise (2:): AND %f6 with the doubleword at [%o3+0x50], convert
 *     with fxtod, then add the doubleword at [%o3+0x58].
 *
 * Both conversion paths store the upper word of %f6 to tmp7, reload it
 * into %g1, and rebuild iexp (%o7), hx (%o2) and res (%f44) before
 * resuming at .cont21.
 *
 * NOTE(review): this looks like the subnormal-input path, with 1:
 * deferring negative/zero inputs.  %o3 presumably addresses .CONST_TBL,
 * which would make offsets 0x50 and 0x58 the DC4 and D2ON51 entries --
 * confirm against the main loop, which is outside this chunk.
 */
.update21:
        sub     %l6,stridex,%i5
        cmp     counter,4
        ble     .cont21
        fand    %f0,DC0,%f16            ! (4_0) res = vis_fand(res,DC0);

        /*
         * bl has no nop after it: the orcc below is its delay slot and
         * also sets the condition codes tested by bz when bl falls through.
         */
        ld      [%i5+4],%i5
        cmp     %g1,0
        bl      1f

        orcc    %g1,%i5,%g0
        bz      1f
        sethi   %hi(0x00080000),%i5

        /* Annulled branch: the ldd runs only when the branch to 2: is taken. */
        cmp     %g1,%i5
        bge,a   2f
        ldd     [%o3+0x50],%f34

        /* %g1 < 0x00080000: convert the bit pattern directly. */
        fxtod   %f6,%f6                 ! res = *(long long*)&res;
        st      %f6,[%fp+tmp7]

        fand    %f6,DC0,%f44            ! (3_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (3_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (3_0) hx >>= 10;

        sub     %o7,537,%o7
        and     %o2,2040,%o2            ! (3_0) hx &= 0x7f8;

        sub     %g0,%o7,%o7             ! (3_0) iexp = -iexp;
        ba      .cont21
        for     %f44,DC1,%f44           ! (3_0) res = vis_for(res,DC1);
2:
        /* %g1 >= 0x00080000: mask, convert, then add the second constant. */
        fand    %f6,%f34,%f6
        fxtod   %f6,%f6                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f34
        faddd   %f6,%f34,%f6
        st      %f6,[%fp+tmp7]

        fand    %f6,DC0,%f44            ! (3_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (3_0) iexp = hx >> 21;
        sra     %g1,10,%o2              ! (3_0) hx >>= 10;

        sub     %o7,537,%o7
        and     %o2,2040,%o2            ! (3_0) hx &= 0x7f8;

        sub     %g0,%o7,%o7             ! (3_0) iexp = -iexp;
        ba      .cont21
        for     %f44,DC1,%f44           ! (3_0) res = vis_for(res,DC1);
1:
        /* Record where to restart and how many elements are left. */
        sub     %l6,stridex,tmp_px
        sub     counter,4,tmp_counter

        ba      .cont21
        mov     4,counter

        .align  16
/*
 * .update22: out-of-line fixup that branches back to .cont22.
 *
 * If counter <= 5, execution resumes at .cont22 with no bookkeeping.
 * Otherwise %i0 - stridex is saved in tmp_px, counter - 5 is saved in
 * tmp_counter, and counter is forced to 5 before resuming.  Note that
 * the base register here is %i0, not %l6 as in the lower-numbered stubs.
 *
 * The fmuld sits in the delay slot of a non-annulled branch, so it runs
 * on both paths.
 *
 * NOTE(review): presumably this defers a special-case input to a later
 * pass of the main loop; the code that jumps here is outside this
 * chunk -- confirm.
 */
.update22:
        cmp     counter,5
        ble     .cont22
        fmuld   %f62,%f38,%f62          ! (1_0) res *= xx;

        /* Record where to restart and how many elements are left. */
        sub     %i0,stridex,tmp_px
        sub     counter,5,tmp_counter

        ba      .cont22
        mov     5,counter

        .align  16
/*
 * .update23: out-of-line fixup that branches back to .cont23.
 *
 * %l1 is set to %i0 - stridex.  If counter <= 5 the code resumes at
 * .cont23 at once; the fand in the delay slot of that non-annulled
 * branch runs on both paths.  Otherwise the word at [%l1+4] is loaded and
 * %g1 (called hx by the comments below) selects one of three paths:
 *
 *   - %g1 < 0, or (%g1 | loaded word) == 0: go to 1:, which saves
 *     %i0 - stridex in tmp_px and counter - 5 in tmp_counter, then
 *     resumes with counter forced to 5.
 *   - %g1 < 0x00080000: convert %f0 with fxtod as it stands.
 *   - otherwise (2:): AND %f0 with the doubleword at [%o3+0x50], convert
 *     with fxtod, then add the doubleword at [%o3+0x58].
 *
 * Both conversion paths store the upper word of %f0 to tmp7, reload it
 * into %g1, and rebuild iexp (%o7), hx (%o2) and res (%f24) before
 * resuming at .cont23.
 *
 * NOTE(review): this looks like the subnormal-input path, with 1:
 * deferring negative/zero inputs.  %o3 presumably addresses .CONST_TBL,
 * which would make offsets 0x50 and 0x58 the DC4 and D2ON51 entries --
 * confirm against the main loop, which is outside this chunk.
 */
.update23:
        sub     %i0,stridex,%l1
        cmp     counter,5
        ble     .cont23
        fand    %f6,DC0,%f16            ! (5_0) res = vis_fand(res,DC0);

        /*
         * bl has no nop after it: the orcc below is its delay slot and
         * also sets the condition codes tested by bz when bl falls through.
         */
        ld      [%l1+4],%l1
        cmp     %g1,0
        bl      1f

        orcc    %g1,%l1,%g0
        bz      1f
        sethi   %hi(0x00080000),%l1

        /* Annulled branch: the ldd runs only when the branch to 2: is taken. */
        cmp     %g1,%l1
        bge,a   2f
        ldd     [%o3+0x50],%f34

        /* %g1 < 0x00080000: convert the bit pattern directly. */
        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        st      %f0,[%fp+tmp7]

        fand    %f0,DC0,%f24            ! (4_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (4_0) iexp = hx >> 21;

        sra     %g1,10,%o2              ! (4_0) hx >>= 10;
        sub     %o7,537,%o7

        and     %o2,2040,%o2            ! (4_0) hx &= 0x7f8;
        sub     %g0,%o7,%o7             ! (4_0) iexp = -iexp;
        ba      .cont23
        for     %f24,DC1,%f24           ! (4_0) res = vis_for(res,DC1);
2:
        /* %g1 >= 0x00080000: mask, convert, then add the second constant. */
        fand    %f0,%f34,%f0
        fxtod   %f0,%f0                 ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f34
        faddd   %f0,%f34,%f0
        st      %f0,[%fp+tmp7]

        fand    %f0,DC0,%f24            ! (4_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (4_0) iexp = hx >> 21;

        sra     %g1,10,%o2              ! (4_0) hx >>= 10;
        sub     %o7,537,%o7

        and     %o2,2040,%o2            ! (4_0) hx &= 0x7f8;
        sub     %g0,%o7,%o7             ! (4_0) iexp = -iexp;
        ba      .cont23
        for     %f24,DC1,%f24           ! (4_0) res = vis_for(res,DC1);
1:
        /* Record where to restart and how many elements are left. */
        sub     %i0,stridex,tmp_px
        sub     counter,5,tmp_counter

        ba      .cont23
        mov     5,counter

        .align  16
/*
 * .update24: out-of-line fixup that branches back to .cont24.
 *
 * If counter <= 6, execution resumes at .cont24 with no bookkeeping.
 * Otherwise %i1 - stridex is saved in tmp_px, counter - 6 is saved in
 * tmp_counter, and counter is forced to 6 before resuming.  Note that
 * the base register here is %i1.
 *
 * The fmuld sits in the delay slot of a non-annulled branch, so it runs
 * on both paths.
 *
 * NOTE(review): presumably this defers a special-case input to a later
 * pass of the main loop; the code that jumps here is outside this
 * chunk -- confirm.
 */
.update24:
        cmp     counter,6
        ble     .cont24
        fmuld   %f62,%f36,%f62          ! (2_0) res *= xx;

        /* Record where to restart and how many elements are left. */
        sub     %i1,stridex,tmp_px
        sub     counter,6,tmp_counter

        ba      .cont24
        mov     6,counter

        .align  16
/*
 * .update25: out-of-line fixup that branches back to .cont25.
 *
 * %i3 is set to %i1 - stridex.  If counter <= 6 the code resumes at
 * .cont25 at once; the fand in the delay slot of that non-annulled
 * branch runs on both paths.  Otherwise the word at [%i3+4] is loaded and
 * %g1 (called hx by the comments below) selects one of three paths:
 *
 *   - %g1 < 0, or (%g1 | loaded word) == 0: go to 1:, which saves
 *     %i1 - stridex in tmp_px and counter - 6 in tmp_counter, then
 *     resumes with counter forced to 6.
 *   - %g1 < 0x00080000: convert %f10 with fxtod as it stands.
 *   - otherwise (2:): AND %f10 with the doubleword at [%o3+0x50], convert
 *     with fxtod, then add the doubleword at [%o3+0x58].
 *
 * Unlike the other conversion fixups visible in this chunk, this one
 * reloads both words of the operand from [%i1 - stridex] into %f10/%f11
 * before converting, instead of using a value already held in a register.
 *
 * Both conversion paths store the upper word of %f10 to tmp7, reload it
 * into %g1, and rebuild iexp (%o7), hx (%o2) and res (%f28) before
 * resuming at .cont25.
 *
 * NOTE(review): this looks like the subnormal-input path, with 1:
 * deferring negative/zero inputs.  %o3 presumably addresses .CONST_TBL,
 * which would make offsets 0x50 and 0x58 the DC4 and D2ON51 entries --
 * confirm against the main loop, which is outside this chunk.
 */
.update25:
        sub     %i1,stridex,%i3
        cmp     counter,6
        ble     .cont25
        fand    %f6,DC0,%f16            ! (6_0) res = vis_fand(res,DC0);

        /*
         * bl has no nop after it: the orcc below is its delay slot and
         * also sets the condition codes tested by bz when bl falls through.
         */
        ld      [%i3+4],%i3
        cmp     %g1,0
        bl      1f

        orcc    %g1,%i3,%g0
        bz      1f
        nop

        /* %i3 was overwritten by the load above; recompute the address. */
        sub     %i1,stridex,%i3
        ld      [%i3],%f10
        ld      [%i3+4],%f11

        sethi   %hi(0x00080000),%i3

        /* Annulled branch: the ldd runs only when the branch to 2: is taken. */
        cmp     %g1,%i3
        bge,a   2f
        ldd     [%o3+0x50],%f60

        /* %g1 < 0x00080000: convert the bit pattern directly. */
        fxtod   %f10,%f10               ! res = *(long long*)&res;
        st      %f10,[%fp+tmp7]

        fand    %f10,DC0,%f28           ! (5_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (5_0) iexp = hx >> 21;

        sra     %g1,10,%o2              ! (5_0) hx >>= 10;
        sub     %o7,537,%o7

        and     %o2,2040,%o2            ! (5_0) hx &= 0x7f8;
        sub     %g0,%o7,%o7             ! (5_0) iexp = -iexp;

        ba      .cont25
        for     %f28,DC1,%f28           ! (5_0) res = vis_for(res,DC1);
2:
        /* %g1 >= 0x00080000: mask, convert, then add the second constant. */
        fand    %f10,%f60,%f10
        fxtod   %f10,%f10               ! res = *(long long*)&res;
        ldd     [%o3+0x58],%f60
        faddd   %f10,%f60,%f10
        st      %f10,[%fp+tmp7]

        fand    %f10,DC0,%f28           ! (5_0) res = vis_fand(res,DC0);
        ld      [%fp+tmp7],%g1

        sra     %g1,21,%o7              ! (5_0) iexp = hx >> 21;

        sra     %g1,10,%o2              ! (5_0) hx >>= 10;
        sub     %o7,537,%o7

        and     %o2,2040,%o2            ! (5_0) hx &= 0x7f8;
        sub     %g0,%o7,%o7             ! (5_0) iexp = -iexp;

        ba      .cont25
        for     %f28,DC1,%f28           ! (5_0) res = vis_for(res,DC1);
1:
        /* Record where to restart and how many elements are left. */
        sub     %i1,stridex,tmp_px
        sub     counter,6,tmp_counter

        ba      .cont25
        mov     6,counter

/*
 * .exit: return point of __vrsqrt.  ret returns to the caller and the
 * restore in its delay slot pops the register window.
 * NOTE(review): SET_SIZE is a macro defined outside this chunk;
 * presumably it emits the symbol-size directive for __vrsqrt -- confirm.
 */
.exit:
        ret
        restore
        SET_SIZE(__vrsqrt)