root/usr/src/lib/libmvec/common/vis/__vhypot.S
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

        .file   "__vhypot.S"

#include "libm.h"

        RO_DATA
        .align  64

! Constant pool for __vhypot.  Every entry is one 8-byte IEEE double
! written as two 32-bit words (high word first, low word zero).  The
! entry point loads the entries with ldd at byte offsets 0, 8, 16, 24
! and 32 from this label, so the order and size of the entries must
! stay as they are.
!
!   offset  0: 0x7ff00000  exponent-field mask; and-ed with max(x,y)
!                          to obtain dmax
!   offset  8: 0x7fe00000  2^1023; compared against x and y with
!                          fcmple32 to detect large operands, and used
!                          as the minuend of fpsub32 to build dnorm
!   offset 16: 0x00100000  2^-1022; compared against x and y with
!                          fcmpgt32 to detect tiny operands, and used
!                          as the scale-down multiplier in .spec0
!   offset 24: 0x41b00000  2^28; added and then subtracted to split
!                          x and y into x_hi/x_lo and y_hi/y_lo
!   offset 32: 0x7fd00000  2^1022; multiplier applied to the square
!                          root in .spec0 to undo the 2^-1022 scaling
.CONST_TBL:
        .word   0x7ff00000, 0   ! DC0
        .word   0x7fe00000, 0   ! DC1
        .word   0x00100000, 0   ! DC2
        .word   0x41b00000, 0   ! D2ON28 = 268435456.0
        .word   0x7fd00000, 0   ! DC3

! Integer register aliases.
!
! The first alias names the register holding the number of elements
! left in the current pass; it is reloaded at .begin from the saved
! count.  The three tmp_ aliases hold a saved count and saved x and y
! pointers: .begin copies them into the working count, %i1 and %i3 and
! then clears the saved count.  The three stride aliases hold the
! caller strides after they have been shifted left by 3, that is,
! converted from units of doubles to bytes.  The x and y strides are
! scaled in place in their incoming argument registers; the z stride
! is first loaded from the caller stack frame.
!
! Keep comments off the define lines themselves: text following the
! replacement on the same line becomes part of the macro expansion.
#define counter         %i0
#define tmp_counter     %l3
#define tmp_px          %l5
#define tmp_py          %o7
#define stridex         %i2
#define stridey         %i4
#define stridez         %l0

! Floating-point register aliases for the constants loaded from the
! constant table at function entry (offsets 0, 8, 16 and 32 for the
! four DC values, offset 24 for the 2^28 splitter).  The _HI and _LO
! names are the two 32-bit halves of the first constant register pair.
#define DC0             %f8
#define DC0_HI          %f8
#define DC0_LO          %f9
#define DC1             %f46
#define DC2             %f48
#define DC3             %f0
#define D2ON28          %f62

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!      !!!!!   algorithm   !!!!!
!  ((float*)&x)[0] = ((float*)px)[0];
!  ((float*)&x)[1] = ((float*)px)[1];
!
!  ((float*)&y)[0] = ((float*)py)[0];
!  ((float*)&y)[1] = ((float*)py)[1];
!
!  x = fabs(x);
!  y = fabs(y);
!
!  c0 = vis_fcmple32(DC1,x);
!  c2 = vis_fcmple32(DC1,y);
!  c1 = vis_fcmpgt32(DC2,x);
!  c3 = vis_fcmpgt32(DC2,y);
!
!  c0 |= c2;
!  c1 &= c3;
!  if ( (c0 & 2) != 0 )
!  {
!    lx = ((int*)px)[1];
!    ly = ((int*)py)[1];
!    hx = *(int*)px;
!    hy = *(int*)py;
!
!    hx &= 0x7fffffff;
!    hy &= 0x7fffffff;
!
!    j0 = hx;
!    if ( j0 < hy ) j0 = hy;
!    j0 &= 0x7ff00000;
!    if ( j0 >= 0x7ff00000 )
!    {
!      if ( hx == 0x7ff00000 && lx == 0 ) res = x == y ? y : x;
!      else if ( hy == 0x7ff00000 && ly == 0 ) res = x == y ? x : y;
!      else res = x * y;
!
!      ((float*)pz)[0] = ((float*)&res)[0];
!      ((float*)pz)[1] = ((float*)&res)[1];
!    }
!    else
!    {
!      diff = hy - hx;
!      j0 = diff >> 31;
!      if ( ((diff ^ j0) - j0) < 0x03600000 )
!      {
!        x *= D2ONM1022;
!        y *= D2ONM1022;
!
!        x_hi = ( x + D2ON28 ) - D2ON28;
!        x_lo = x - x_hi;
!        y_hi = ( y + D2ON28 ) - D2ON28;
!        y_lo = y - y_hi;
!        res = (x_hi * x_hi + y_hi * y_hi);
!        res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
!
!        res = sqrt(res);
!
!        res = D2ONP1022 * res;
!        ((float*)pz)[0] = ((float*)&res)[0];
!        ((float*)pz)[1] = ((float*)&res)[1];
!      }
!      else
!      {
!        res = x + y;
!        ((float*)pz)[0] = ((float*)&res)[0];
!        ((float*)pz)[1] = ((float*)&res)[1];
!      }
!    }
!    px += stridex;
!    py += stridey;
!    pz += stridez;
!    continue;
!  }
!  if ( (c1 & 2) != 0 )
!  {
!    x *= D2ONP1022;
!    y *= D2ONP1022;
!
!    x_hi = ( x + D2ON28 ) - D2ON28;
!    x_lo = x - x_hi;
!    y_hi = ( y + D2ON28 ) - D2ON28;
!    y_lo = y - y_hi;
!    res = (x_hi * x_hi + y_hi * y_hi);
!    res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
!
!    res = sqrt(res);
!
!    res = D2ONM1022 * res;
!    ((float*)pz)[0] = ((float*)&res)[0];
!    ((float*)pz)[1] = ((float*)&res)[1];
!    px += stridex;
!    py += stridey;
!    pz += stridez;
!    continue;
!  }
!
!  dmax = x;
!  if ( dmax < y ) dmax = y;
!
!  dmax = vis_fand(dmax,DC0);
!  dnorm = vis_fpsub32(DC1,dmax);
!
!  x *= dnorm;
!  y *= dnorm;
!
!  x_hi = x + D2ON28;
!  x_hi -= D2ON28;
!  x_lo = x - x_hi;
!
!  y_hi = y + D2ON28;
!  y_hi -= D2ON28;
!  y_lo = y - y_hi;
!
!  res = x_hi * x_hi;
!  dtmp1 = x + x_hi;
!  dtmp0 = y_hi * y_hi;
!  dtmp2 = y + y_hi;
!
!  res += dtmp0;
!  dtmp1 *= x_lo;
!  dtmp2 *= y_lo;
!  dtmp1 += dtmp2;
!  res += dtmp1;
!
!  res = sqrt(res);
!
!  res = dmax * res;
!  ((float*)pz)[0] = ((float*)&res)[0];
!  ((float*)pz)[1] = ((float*)&res)[1];
!
!  px += stridex;
!  py += stridey;
!  pz += stridez;
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

        ENTRY(__vhypot)
        save    %sp,-SA(MINFRAME),%sp
        PIC_SETUP(l7)
        PIC_SET(l7,.CONST_TBL,o3)
        wr      %g0,0x82,%asi

#ifdef __sparcv9
        ldx     [%fp+STACK_BIAS+176],%l0
#else
        ld      [%fp+STACK_BIAS+92],%l0
#endif
        ldd     [%o3],DC0
        sll     %i2,3,stridex
        mov     %i0,tmp_counter

        ldd     [%o3+8],DC1
        sll     %i4,3,stridey
        mov     %i1,tmp_px

        ldd     [%o3+16],DC2
        sll     %l0,3,stridez
        mov     %i3,tmp_py

        ldd     [%o3+24],D2ON28

        ldd     [%o3+32],DC3

.begin:
        mov     tmp_counter,counter
        mov     tmp_px,%i1
        mov     tmp_py,%i3
        clr     tmp_counter
.begin1:
        cmp     counter,0
        ble,pn  %icc,.exit
        nop

        lda     [%i1]%asi,%o0
        sethi   %hi(0x7ffffc00),%o5

        lda     [%i3]%asi,%o2
        add     %o5,1023,%o5

        lda     [%i1]%asi,%f26          ! (1_0) ((float*)&x)[0] = ((float*)px)[0];

        lda     [%i1+4]%asi,%f27        ! (1_0) ((float*)&x)[1] = ((float*)px)[1];
        add     %i1,stridex,%o1         ! px += stridex

        lda     [%i3]%asi,%f24          ! (1_0) ((float*)&y)[0] = ((float*)py)[0];
        sethi   %hi(0x00100000),%l7
        and     %o0,%o5,%o0

        lda     [%i3+4]%asi,%f25        ! (1_0) ((float*)&y)[1] = ((float*)py)[1];
        and     %o2,%o5,%o2
        sethi   %hi(0x7fe00000),%l6

        fabsd   %f26,%f36               ! (1_0) x = fabs(x);
        cmp     %o0,%o2
        mov     %o2,%l4

        fabsd   %f24,%f54               ! (1_0) y = fabs(y);
        add     %i3,stridey,%o5         ! py += stridey
        movg    %icc,%o0,%o2
        lda     [%o5]%asi,%f28          ! (2_0) ((float*)&y)[0] = ((float*)py)[0];

        cmp     %o2,%l6
        sethi   %hi(0x7ff00000),%o4
        bge,pn  %icc,.spec0
        lda     [%o5+4]%asi,%f29        ! (2_0) ((float*)&y)[1] = ((float*)py)[1];

        cmp     %o2,%l7
        bl,pn   %icc,.spec1
        nop
        lda     [%o1]%asi,%f26          ! (2_0) ((float*)&x)[0] = ((float*)px)[0];

        lda     [%o1+4]%asi,%f27        ! (2_0) ((float*)&x)[1] = ((float*)px)[1];
        add     %i3,stridey,%i3         ! py += stridey

        fabsd   %f28,%f34               ! (2_0) y = fabs(y);

        fabsd   %f26,%f50               ! (2_0) x = fabs(x);

        fcmple32        DC1,%f50,%o3    ! (2_0) c0 = vis_fcmple32(DC1,x);

        fcmple32        DC1,%f34,%o0    ! (2_0) c2 = vis_fcmple32(DC1,y);

        fcmpgt32        DC2,%f50,%o4    ! (2_0) c1 = vis_fcmpgt32(DC2,x);

        fcmpgt32        DC2,%f34,%o5    ! (2_0) c3 = vis_fcmpgt32(DC2,y);

        or      %o3,%o0,%o3             ! (2_0) c0 |= c2;

        andcc   %o3,2,%g0               ! (2_0) c0 & 2
        bnz,pn  %icc,.update0           ! (2_0) if ( (c0 & 2) != 0 )
        and     %o4,%o5,%o4             ! (2_0) c1 &= c3;
.cont0:
        add     %i3,stridey,%l4         ! py += stridey
        andcc   %o4,2,%g0               ! (2_0) c1 & 2
        bnz,pn  %icc,.update1           ! (2_0) if ( (c1 & 2) != 0 )
        fmovd   %f36,%f56               ! (1_0) dmax = x;
.cont1:
        lda     [%l4]%asi,%f30          ! (3_0) ((float*)&y)[0] = ((float*)py)[0];
        add     %o1,stridex,%l2         ! px += stridex

        lda     [%l4+4]%asi,%f31        ! (3_0) ((float*)&y)[1] = ((float*)py)[1];

        lda     [%l2]%asi,%f18          ! (3_1) ((float*)&x)[0] = ((float*)px)[0];

        lda     [%l2+4]%asi,%f19        ! (3_1) ((float*)&x)[1] = ((float*)px)[1];

        fabsd   %f30,%f30               ! (3_1) y = fabs(y);

        fabsd   %f18,%f18               ! (3_1) x = fabs(x);

        fcmped  %fcc2,%f54,%f56         ! (1_1) dmax ? y

        fmovdg  %fcc2,%f54,%f56         ! (1_1) if ( dmax < y ) dmax = y;

        fcmple32        DC1,%f18,%o3    ! (3_1) c0 = vis_fcmple32(DC1,x);

        fcmple32        DC1,%f30,%o0    ! (3_1) c2 = vis_fcmple32(DC1,y);

        fcmpgt32        DC2,%f18,%o4    ! (3_1) c1 = vis_fcmpgt32(DC2,x);

        fcmpgt32        DC2,%f30,%o1    ! (3_1) c3 = vis_fcmpgt32(DC2,y);

        fand    %f56,DC0,%f38           ! (1_1) dmax = vis_fand(dmax,DC0);

        or      %o3,%o0,%o3             ! (3_1) c0 |= c2;

        andcc   %o3,2,%g0               ! (3_1) c0 & 2
        bnz,pn  %icc,.update2           ! (3_1) if ( (c0 & 2) != 0 )
        and     %o4,%o1,%o4             ! (3_1) c1 &= c3;
.cont2:
        add     %l4,stridey,%i3         ! py += stridey
        andcc   %o4,2,%g0               ! (3_1) c1 & 2
        bnz,pn  %icc,.update3           ! (3_1) if ( (c1 & 2) != 0 )
        fmovd   %f50,%f32               ! (2_1) dmax = x;
.cont3:
        fpsub32 DC1,%f38,%f10           ! (1_1) dnorm = vis_fpsub32(DC1,dmax);
        lda     [%i3]%asi,%f20          ! (0_0) ((float*)&y)[0] = ((float*)py)[0];

        lda     [%i3+4]%asi,%f21        ! (0_0) ((float*)&y)[1] = ((float*)py)[1];

        add     %l2,stridex,%l1         ! px += stridex

        fmuld   %f36,%f10,%f36          ! (1_1) x *= dnorm;
        lda     [%l1]%asi,%f22          ! (0_0) ((float*)&x)[0] = ((float*)px)[0]

        lda     [%l1+4]%asi,%f23        ! (0_0) ((float*)&x)[1] = ((float*)px)[1];

        fmuld   %f54,%f10,%f56          ! (1_1) y *= dnorm;
        fabsd   %f20,%f40               ! (0_0) y = fabs(y);

        fabsd   %f22,%f20               ! (0_0) x = fabs(x);

        fcmped  %fcc3,%f34,%f32         ! (2_1) dmax ? y


        fmovdg  %fcc3,%f34,%f32         ! (2_1) if ( dmax < y ) dmax = y;

        faddd   %f36,D2ON28,%f58        ! (1_1) x_hi = x + D2ON28;
        fcmple32        DC1,%f20,%g5    ! (0_0) c0 = vis_fcmple32(DC1,x);

        faddd   %f56,D2ON28,%f22        ! (1_1) y_hi = y + D2ON28;
        fcmple32        DC1,%f40,%o2    ! (0_0) c2 = vis_fcmple32(DC1,y);

        fcmpgt32        DC2,%f20,%g1    ! (0_0) c1 = vis_fcmpgt32(DC2,x);

        fcmpgt32        DC2,%f40,%o4    ! (0_0) c3 = vis_fcmpgt32(DC2,y);

        fand    %f32,DC0,%f52           ! (2_1) dmax = vis_fand(dmax,DC0);

        or      %g5,%o2,%g5             ! (0_0) c0 |= c2;
        fsubd   %f58,D2ON28,%f58        ! (1_1) x_hi -= D2ON28;

        andcc   %g5,2,%g0               ! (0_0) c0 & 2
        bnz,pn  %icc,.update4           ! (0_0) if ( (c0 & 2) != 0 )
        fsubd   %f22,D2ON28,%f22        ! (1_1) y_hi -= D2ON28;
.cont4:
        and     %g1,%o4,%g1             ! (0_0) c1 &= c3;

        add     %i3,stridey,%l2         ! py += stridey
        andcc   %g1,2,%g0               ! (0_0) c1 & 2
        bnz,pn  %icc,.update5           ! (0_0) if ( (c1 & 2) != 0 )
        fmovd   %f18,%f44               ! (3_1) dmax = x;
.cont5:
        fpsub32 DC1,%f52,%f10           ! (2_1) dnorm = vis_fpsub32(DC1,dmax);
        lda     [%l2]%asi,%f24          ! (1_0) ((float*)&y)[0] = ((float*)py)[0];

        fmuld   %f58,%f58,%f60          ! (1_1) res = x_hi * x_hi;
        lda     [%l2+4]%asi,%f25        ! (1_0) ((float*)&y)[1] = ((float*)py)[1];
        add     %l1,stridex,%l7         ! px += stridex
        faddd   %f56,%f22,%f28          ! (1_1) dtmp2 = y + y_hi;

        faddd   %f36,%f58,%f6           ! (1_1) dtmp1 = x + x_hi;
        lda     [%l7]%asi,%f26          ! (1_0) ((float*)&x)[0] = ((float*)px)[0];

        fmuld   %f50,%f10,%f50          ! (2_1) x *= dnorm;
        fsubd   %f36,%f58,%f58          ! (1_1) x_lo = x - x_hi;
        lda     [%l7+4]%asi,%f27        ! (1_0) ((float*)&x)[1] = ((float*)px)[1];

        fmuld   %f22,%f22,%f2           ! (1_1) dtmp0 = y_hi * y_hi;
        fsubd   %f56,%f22,%f56          ! (1_1) y_lo = y - y_hi;

        fmuld   %f34,%f10,%f34          ! (2_1) y *= dnorm;
        fabsd   %f24,%f54               ! (1_0) y = fabs(y);

        fabsd   %f26,%f36               ! (1_0) x = fabs(x);

        fmuld   %f6,%f58,%f10           ! (1_1) dtmp1 *= x_lo;
        fcmped  %fcc0,%f30,%f44         ! (3_1) dmax ? y

        fmuld   %f28,%f56,%f26          ! (1_1) dtmp2 *= y_lo;

        fmovdg  %fcc0,%f30,%f44         ! (3_1) if ( dmax < y ) dmax = y;

        faddd   %f50,D2ON28,%f58        ! (2_1) x_hi = x + D2ON28;
        fcmple32        DC1,%f36,%g1    ! (1_0) c0 = vis_fcmple32(DC1,x);

        faddd   %f34,D2ON28,%f22        ! (2_1) y_hi = y + D2ON28;
        fcmple32        DC1,%f54,%g5    ! (1_0) c2 = vis_fcmple32(DC1,y);

        faddd   %f60,%f2,%f24           ! (1_1) res += dtmp0;
        fcmpgt32        DC2,%f36,%o5    ! (1_0) c1 = vis_fcmpgt32(DC2,x);

        faddd   %f10,%f26,%f28          ! (1_1) dtmp1 += dtmp2;
        fcmpgt32        DC2,%f54,%o1    ! (1_0) c3 = vis_fcmpgt32(DC2,y);

        fand    %f44,DC0,%f14           ! (3_1) dmax = vis_fand(dmax,DC0);

        or      %g1,%g5,%g1             ! (1_0) c0 |= c2;
        fsubd   %f58,D2ON28,%f44        ! (2_1) x_hi -= D2ON28;

        andcc   %g1,2,%g0               ! (1_0) c0 & 2
        bnz,pn  %icc,.update6           ! (1_0) if ( (c0 & 2) != 0 )
        fsubd   %f22,D2ON28,%f58        ! (2_1) y_hi -= D2ON28;
.cont6:
        and     %o5,%o1,%o5             ! (1_0) c1 &= c3;
        faddd   %f24,%f28,%f26          ! (1_1) res += dtmp1;

        add     %l2,stridey,%i3         ! py += stridey
        andcc   %o5,2,%g0               ! (1_0) c1 & 2
        bnz,pn  %icc,.update7           ! (1_0) if ( (c1 & 2) != 0 )
        fmovd   %f20,%f4                ! (0_0) dmax = x;
.cont7:
        fpsub32 DC1,%f14,%f10           ! (3_1) dnorm = vis_fpsub32(DC1,dmax);
        lda     [%i3]%asi,%f28          ! (2_0) ((float*)&y)[0] = ((float*)py)[0];

        fmuld   %f44,%f44,%f2           ! (2_1) res = x_hi * x_hi;
        lda     [%i3+4]%asi,%f29        ! (2_0) ((float*)&y)[1] = ((float*)py)[1];
        add     %l7,stridex,%o1         ! px += stridex
        faddd   %f34,%f58,%f60          ! (2_1) dtmp2 = y + y_hi;

        fsqrtd  %f26,%f24               ! (1_1) res = sqrt(res);
        lda     [%o1]%asi,%f26          ! (2_0) ((float*)&x)[0] = ((float*)px)[0];
        faddd   %f50,%f44,%f56          ! (2_1) dtmp1 = x + x_hi;

        fmuld   %f18,%f10,%f6           ! (3_1) x *= dnorm;
        fsubd   %f50,%f44,%f18          ! (2_1) x_lo = x - x_hi;
        lda     [%o1+4]%asi,%f27        ! (2_0) ((float*)&x)[1] = ((float*)px)[1];

        fmuld   %f58,%f58,%f44          ! (2_1) dtmp0 = y_hi * y_hi;
        fsubd   %f34,%f58,%f22          ! (2_1) y_lo = y - y_hi;

        fmuld   %f30,%f10,%f58          ! (3_1) y *= dnorm;
        fabsd   %f28,%f34               ! (2_0) y = fabs(y);

        fabsd   %f26,%f50               ! (2_0) x = fabs(x);

        fmuld   %f56,%f18,%f10          ! (2_1) dtmp1 *= x_lo;
        fcmped  %fcc1,%f40,%f4          ! (0_0) dmax ? y

        fmuld   %f60,%f22,%f12          ! (2_1) dtmp2 *= y_lo;

        fmovdg  %fcc1,%f40,%f4          ! (0_0) if ( dmax < y ) dmax = y;

        faddd   %f6,D2ON28,%f56         ! (3_1) x_hi = x + D2ON28;
        fcmple32        DC1,%f50,%o3    ! (2_0) c0 = vis_fcmple32(DC1,x);

        faddd   %f58,D2ON28,%f28        ! (3_1) y_hi = y + D2ON28;
        fcmple32        DC1,%f34,%o0    ! (2_0) c2 = vis_fcmple32(DC1,y);

        faddd   %f2,%f44,%f30           ! (2_1) res += dtmp0;
        fcmpgt32        DC2,%f50,%o4    ! (2_0) c1 = vis_fcmpgt32(DC2,x);

        faddd   %f10,%f12,%f26          ! (2_1) dtmp1 += dtmp2;
        fcmpgt32        DC2,%f34,%o5    ! (2_0) c3 = vis_fcmpgt32(DC2,y);

        fand    %f4,DC0,%f16            ! (0_0) dmax = vis_fand(dmax,DC0);

        or      %o3,%o0,%o3             ! (2_0) c0 |= c2;
        fsubd   %f56,D2ON28,%f18        ! (3_1) x_hi -= D2ON28;

        andcc   %o3,2,%g0               ! (2_0) c0 & 2
        bnz,pn  %icc,.update8           ! (2_0) if ( (c0 & 2) != 0 )
        fsubd   %f28,D2ON28,%f4         ! (3_1) y_hi -= D2ON28;
.cont8:
        and     %o4,%o5,%o4             ! (2_0) c1 &= c3;
        faddd   %f30,%f26,%f12          ! (2_1) res += dtmp1;

        add     %i3,stridey,%l4         ! py += stridey
        andcc   %o4,2,%g0               ! (2_0) c1 & 2
        bnz,pn  %icc,.update9           ! (2_0) if ( (c1 & 2) != 0 )
        fmovd   %f36,%f56               ! (1_0) dmax = x;
.cont9:
        lda     [%l4]%asi,%f30          ! (3_0) ((float*)&y)[0] = ((float*)py)[0];
        add     %o1,stridex,%l2         ! px += stridex
        fpsub32 DC1,%f16,%f44           ! (0_0) dnorm = vis_fpsub32(DC1,dmax);

        fmuld   %f18,%f18,%f60          ! (3_1) res = x_hi * x_hi;
        lda     [%l4+4]%asi,%f31        ! (3_0) ((float*)&y)[1] = ((float*)py)[1];
        faddd   %f58,%f4,%f32           ! (3_1) dtmp2 = y + y_hi;

        fsqrtd  %f12,%f12               ! (2_1) res = sqrt(res);
        faddd   %f6,%f18,%f28           ! (3_1) dtmp1 = x + x_hi;

        cmp     counter,4
        bl,pn   %icc,.tail
        nop

        ba      .main_loop
        sub     counter,4,counter

        .align  16
.main_loop:
        fmuld   %f20,%f44,%f2           ! (0_1) x *= dnorm;
        fsubd   %f6,%f18,%f20           ! (3_2) x_lo = x - x_hi;
        lda     [%l2]%asi,%f18          ! (3_1) ((float*)&x)[0] = ((float*)px)[0];

        fmuld   %f4,%f4,%f22            ! (3_2) dtmp0 = y_hi * y_hi;
        lda     [%l2+4]%asi,%f19        ! (3_1) ((float*)&x)[1] = ((float*)px)[1];
        fsubd   %f58,%f4,%f58           ! (3_2) y_lo = y - y_hi;

        fmuld   %f40,%f44,%f44          ! (0_1) y *= dnorm;
        fabsd   %f30,%f30               ! (3_1) y = fabs(y);

        fmuld   %f38,%f24,%f10          ! (1_2) res = dmax * res;
        fabsd   %f18,%f18               ! (3_1) x = fabs(x);
        st      %f10,[%i5]              ! (1_2) ((float*)pz)[0] = ((float*)&res)[0];

        fmuld   %f28,%f20,%f28          ! (3_2) dtmp1 *= x_lo;
        st      %f11,[%i5+4]            ! (1_2) ((float*)pz)[1] = ((float*)&res)[1];
        fcmped  %fcc2,%f54,%f56         ! (1_1) dmax ? y

        fmuld   %f32,%f58,%f24          ! (3_2) dtmp2 *= y_lo;

        fmovdg  %fcc2,%f54,%f56         ! (1_1) if ( dmax < y ) dmax = y;

        faddd   %f2,D2ON28,%f10         ! (0_1) x_hi = x + D2ON28;
        fcmple32        DC1,%f18,%o3    ! (3_1) c0 = vis_fcmple32(DC1,x);

        faddd   %f44,D2ON28,%f20        ! (0_1) y_hi = y + D2ON28;
        fcmple32        DC1,%f30,%o0    ! (3_1) c2 = vis_fcmple32(DC1,y);

        faddd   %f60,%f22,%f22          ! (3_2) res += dtmp0;
        fcmpgt32        DC2,%f18,%o4    ! (3_1) c1 = vis_fcmpgt32(DC2,x);

        faddd   %f28,%f24,%f26          ! (3_2) dtmp1 += dtmp2;
        fcmpgt32        DC2,%f30,%o1    ! (3_1) c3 = vis_fcmpgt32(DC2,y);

        fand    %f56,DC0,%f38           ! (1_1) dmax = vis_fand(dmax,DC0);

        or      %o3,%o0,%o3             ! (3_1) c0 |= c2;
        fsubd   %f10,D2ON28,%f58        ! (0_1) x_hi -= D2ON28;

        andcc   %o3,2,%g0               ! (3_1) c0 & 2
        bnz,pn  %icc,.update10          ! (3_1) if ( (c0 & 2) != 0 )
        fsubd   %f20,D2ON28,%f56        ! (0_1) y_hi -= D2ON28;
.cont10:
        faddd   %f22,%f26,%f28          ! (3_2) res += dtmp1;
        and     %o4,%o1,%o4             ! (3_1) c1 &= c3;

        add     %l4,stridey,%i3         ! py += stridey
        andcc   %o4,2,%g0               ! (3_1) c1 & 2
        bnz,pn  %icc,.update11          ! (3_1) if ( (c1 & 2) != 0 )
        fmovd   %f50,%f32               ! (2_1) dmax = x;
.cont11:
        fpsub32 DC1,%f38,%f10           ! (1_1) dnorm = vis_fpsub32(DC1,dmax);
        add     %l2,stridex,%l1         ! px += stridex
        lda     [%i3]%asi,%f20          ! (0_0) ((float*)&y)[0] = ((float*)py)[0];

        fmuld   %f58,%f58,%f6           ! (0_1) res = x_hi * x_hi;
        lda     [%i3+4]%asi,%f21        ! (0_0) ((float*)&y)[1] = ((float*)py)[1];
        add     %i5,stridez,%l6         ! pz += stridez
        faddd   %f44,%f56,%f60          ! (0_1) dtmp2 = y + y_hi;

        fsqrtd  %f28,%f4                ! (3_2) res = sqrt(res);
        lda     [%l1]%asi,%f22          ! (0_0) ((float*)&x)[0] = ((float*)px)[0];
        faddd   %f2,%f58,%f24           ! (0_1) dtmp1 = x + x_hi;

        fmuld   %f36,%f10,%f36          ! (1_1) x *= dnorm;
        fsubd   %f2,%f58,%f26           ! (0_1) x_lo = x - x_hi;
        lda     [%l1+4]%asi,%f23        ! (0_0) ((float*)&x)[1] = ((float*)px)[1];

        fmuld   %f56,%f56,%f28          ! (0_1) dtmp0 = y_hi * y_hi;
        fsubd   %f44,%f56,%f44          ! (0_1) y_lo = y - y_hi;

        fmuld   %f54,%f10,%f56          ! (1_1) y *= dnorm;
        fabsd   %f20,%f40               ! (0_0) y = fabs(y);

        fmuld   %f52,%f12,%f12          ! (2_2) res = dmax * res;
        fabsd   %f22,%f20               ! (0_0) x = fabs(x);
        st      %f12,[%l6]              ! (2_2) ((float*)pz)[0] = ((float*)&res)[0];

        fmuld   %f24,%f26,%f10          ! (0_1) dtmp1 *= x_lo;
        st      %f13,[%l6+4]            ! (2_2) ((float*)pz)[1] = ((float*)&res)[1];
        fcmped  %fcc3,%f34,%f32         ! (2_1) dmax ? y

        fmuld   %f60,%f44,%f12          ! (0_1) dtmp2 *= y_lo;

        fmovdg  %fcc3,%f34,%f32         ! (2_1) if ( dmax < y ) dmax = y;

        faddd   %f36,D2ON28,%f58        ! (1_1) x_hi = x + D2ON28;
        fcmple32        DC1,%f20,%g5    ! (0_0) c0 = vis_fcmple32(DC1,x);

        faddd   %f56,D2ON28,%f22        ! (1_1) y_hi = y + D2ON28;
        fcmple32        DC1,%f40,%o2    ! (0_0) c2 = vis_fcmple32(DC1,y);

        faddd   %f6,%f28,%f24           ! (0_1) res += dtmp0;
        fcmpgt32        DC2,%f20,%g1    ! (0_0) c1 = vis_fcmpgt32(DC2,x);

        faddd   %f10,%f12,%f26          ! (0_1) dtmp1 += dtmp2;
        fcmpgt32        DC2,%f40,%o4    ! (0_0) c3 = vis_fcmpgt32(DC2,y);

        fand    %f32,DC0,%f52           ! (2_1) dmax = vis_fand(dmax,DC0);

        or      %g5,%o2,%g5             ! (0_0) c0 |= c2;
        fsubd   %f58,D2ON28,%f58        ! (1_1) x_hi -= D2ON28;

        andcc   %g5,2,%g0               ! (0_0) c0 & 2
        bnz,pn  %icc,.update12          ! (0_0) if ( (c0 & 2) != 0 )
        fsubd   %f22,D2ON28,%f22        ! (1_1) y_hi -= D2ON28;
.cont12:
        and     %g1,%o4,%g1             ! (0_0) c1 &= c3;
        faddd   %f24,%f26,%f12          ! (0_1) res += dtmp1;

        add     %i3,stridey,%l2         ! py += stridey
        andcc   %g1,2,%g0               ! (0_0) c1 & 2
        bnz,pn  %icc,.update13          ! (0_0) if ( (c1 & 2) != 0 )
        fmovd   %f18,%f44               ! (3_1) dmax = x;
.cont13:
        fpsub32 DC1,%f52,%f10           ! (2_1) dnorm = vis_fpsub32(DC1,dmax);
        add     %l1,stridex,%l7         ! px += stridex
        lda     [%l2]%asi,%f24          ! (1_0) ((float*)&y)[0] = ((float*)py)[0];

        fmuld   %f58,%f58,%f60          ! (1_1) res = x_hi * x_hi;
        add     %l6,stridez,%i5         ! pz += stridez
        lda     [%l2+4]%asi,%f25        ! (1_0) ((float*)&y)[1] = ((float*)py)[1];
        faddd   %f56,%f22,%f28          ! (1_1) dtmp2 = y + y_hi;

        fsqrtd  %f12,%f12               ! (0_1) res = sqrt(res);
        lda     [%l7]%asi,%f26          ! (1_0) ((float*)&x)[0] = ((float*)px)[0];
        faddd   %f36,%f58,%f6           ! (1_1) dtmp1 = x + x_hi;

        fmuld   %f50,%f10,%f50          ! (2_1) x *= dnorm;
        fsubd   %f36,%f58,%f58          ! (1_1) x_lo = x - x_hi;
        lda     [%l7+4]%asi,%f27        ! (1_0) ((float*)&x)[1] = ((float*)px)[1];

        fmuld   %f22,%f22,%f2           ! (1_1) dtmp0 = y_hi * y_hi;
        fsubd   %f56,%f22,%f56          ! (1_1) y_lo = y - y_hi;

        fmuld   %f34,%f10,%f34          ! (2_1) y *= dnorm;
        fabsd   %f24,%f54               ! (1_0) y = fabs(y);

        fmuld   %f14,%f4,%f14           ! (3_2) res = dmax * res;
        fabsd   %f26,%f36               ! (1_0) x = fabs(x);
        st      %f14,[%i5]              ! (3_2) ((float*)pz)[0] = ((float*)&res)[0];

        fmuld   %f6,%f58,%f10           ! (1_1) dtmp1 *= x_lo;
        st      %f15,[%i5+4]            ! (3_2) ((float*)pz)[1] = ((float*)&res)[1];
        fcmped  %fcc0,%f30,%f44         ! (3_1) dmax ? y

        fmuld   %f28,%f56,%f26          ! (1_1) dtmp2 *= y_lo;

        fmovdg  %fcc0,%f30,%f44         ! (3_1) if ( dmax < y ) dmax = y;

        faddd   %f50,D2ON28,%f58        ! (2_1) x_hi = x + D2ON28;
        fcmple32        DC1,%f36,%g1    ! (1_0) c0 = vis_fcmple32(DC1,x);

        faddd   %f34,D2ON28,%f22        ! (2_1) y_hi = y + D2ON28;
        fcmple32        DC1,%f54,%g5    ! (1_0) c2 = vis_fcmple32(DC1,y);

        faddd   %f60,%f2,%f24           ! (1_1) res += dtmp0;
        fcmpgt32        DC2,%f36,%o5    ! (1_0) c1 = vis_fcmpgt32(DC2,x);

        faddd   %f10,%f26,%f28          ! (1_1) dtmp1 += dtmp2;
        fcmpgt32        DC2,%f54,%o1    ! (1_0) c3 = vis_fcmpgt32(DC2,y);

        fand    %f44,DC0,%f14           ! (3_1) dmax = vis_fand(dmax,DC0);

        or      %g1,%g5,%g1             ! (1_0) c0 |= c2;
        fsubd   %f58,D2ON28,%f44        ! (2_1) x_hi -= D2ON28;

        andcc   %g1,2,%g0               ! (1_0) c0 & 2
        bnz,pn  %icc,.update14          ! (1_0) if ( (c0 & 2) != 0 )
        fsubd   %f22,D2ON28,%f58        ! (2_1) y_hi -= D2ON28;
.cont14:
        and     %o5,%o1,%o5             ! (1_0) c1 &= c3;
        faddd   %f24,%f28,%f26          ! (1_1) res += dtmp1;

        add     %l2,stridey,%i3         ! py += stridey
        andcc   %o5,2,%g0               ! (1_0) c1 & 2
        bnz,pn  %icc,.update15          ! (1_0) if ( (c1 & 2) != 0 )
        fmovd   %f20,%f4                ! (0_0) dmax = x;
.cont15:
        fpsub32 DC1,%f14,%f10           ! (3_1) dnorm = vis_fpsub32(DC1,dmax);
        add     %l7,stridex,%o1         ! px += stridex
        lda     [%i3]%asi,%f28          ! (2_0) ((float*)&y)[0] = ((float*)py)[0];

        fmuld   %f44,%f44,%f2           ! (2_1) res = x_hi * x_hi;
        add     %i5,stridez,%g5         ! pz += stridez
        lda     [%i3+4]%asi,%f29        ! (2_0) ((float*)&y)[1] = ((float*)py)[1];
        faddd   %f34,%f58,%f60          ! (2_1) dtmp2 = y + y_hi;

        fsqrtd  %f26,%f24               ! (1_1) res = sqrt(res);
        lda     [%o1]%asi,%f26          ! (2_0) ((float*)&x)[0] = ((float*)px)[0];
        faddd   %f50,%f44,%f56          ! (2_1) dtmp1 = x + x_hi;

        fmuld   %f18,%f10,%f6           ! (3_1) x *= dnorm;
        fsubd   %f50,%f44,%f18          ! (2_1) x_lo = x - x_hi;
        lda     [%o1+4]%asi,%f27        ! (2_0) ((float*)&x)[1] = ((float*)px)[1];

        fmuld   %f58,%f58,%f44          ! (2_1) dtmp0 = y_hi * y_hi;
        fsubd   %f34,%f58,%f22          ! (2_1) y_lo = y - y_hi;

        fmuld   %f30,%f10,%f58          ! (3_1) y *= dnorm;
        fabsd   %f28,%f34               ! (2_0) y = fabs(y);

        fmuld   %f16,%f12,%f16          ! (0_1) res = dmax * res;
        fabsd   %f26,%f50               ! (2_0) x = fabs(x);
        st      %f16,[%g5]              ! (0_1) ((float*)pz)[0] = ((float*)&res)[0];

        fmuld   %f56,%f18,%f10          ! (2_1) dtmp1 *= x_lo;
        st      %f17,[%g5+4]            ! (0_1) ((float*)pz)[1] = ((float*)&res)[1];
        fcmped  %fcc1,%f40,%f4          ! (0_0) dmax ? y

        fmuld   %f60,%f22,%f12          ! (2_1) dtmp2 *= y_lo;

        fmovdg  %fcc1,%f40,%f4          ! (0_0) if ( dmax < y ) dmax = y;

        faddd   %f6,D2ON28,%f56         ! (3_1) x_hi = x + D2ON28;
        fcmple32        DC1,%f50,%o3    ! (2_0) c0 = vis_fcmple32(DC1,x);

        faddd   %f58,D2ON28,%f28        ! (3_1) y_hi = y + D2ON28;
        fcmple32        DC1,%f34,%o0    ! (2_0) c2 = vis_fcmple32(DC1,y);

        faddd   %f2,%f44,%f30           ! (2_1) res += dtmp0;
        fcmpgt32        DC2,%f50,%o4    ! (2_0) c1 = vis_fcmpgt32(DC2,x);

        faddd   %f10,%f12,%f26          ! (2_1) dtmp1 += dtmp2;
        fcmpgt32        DC2,%f34,%o5    ! (2_0) c3 = vis_fcmpgt32(DC2,y);

        fand    %f4,DC0,%f16            ! (0_0) dmax = vis_fand(dmax,DC0);

        or      %o3,%o0,%o3             ! (2_0) c0 |= c2;
        fsubd   %f56,D2ON28,%f18        ! (3_1) x_hi -= D2ON28;

        andcc   %o3,2,%g0               ! (2_0) c0 & 2
        bnz,pn  %icc,.update16          ! (2_0) if ( (c0 & 2) != 0 )
        fsubd   %f28,D2ON28,%f4         ! (3_1) y_hi -= D2ON28;
.cont16:
        and     %o4,%o5,%o4             ! (2_0) c1 &= c3;
        faddd   %f30,%f26,%f12          ! (2_1) res += dtmp1;

        add     %i3,stridey,%l4         ! py += stridey
        andcc   %o4,2,%g0               ! (2_0) c1 & 2
        bnz,pn  %icc,.update17          ! (2_0) if ( (c1 & 2) != 0 )
        fmovd   %f36,%f56               ! (1_0) dmax = x;
.cont17:
        lda     [%l4]%asi,%f30          ! (3_0) ((float*)&y)[0] = ((float*)py)[0];
        add     %o1,stridex,%l2         ! px += stridex
        fpsub32 DC1,%f16,%f44           ! (0_0) dnorm = vis_fpsub32(DC1,dmax);

        fmuld   %f18,%f18,%f60          ! (3_1) res = x_hi * x_hi;
        add     %g5,stridez,%i5         ! pz += stridez
        lda     [%l4+4]%asi,%f31        ! (3_0) ((float*)&y)[1] = ((float*)py)[1];
        faddd   %f58,%f4,%f32           ! (3_1) dtmp2 = y + y_hi;

        fsqrtd  %f12,%f12               ! (2_1) res = sqrt(res);
        subcc   counter,4,counter       ! counter -= 4;
        bpos,pt %icc,.main_loop
        faddd   %f6,%f18,%f28           ! (3_1) dtmp1 = x + x_hi;

        add     counter,4,counter

.tail:
        subcc   counter,1,counter
        bneg,a  .begin
        nop

        fsubd   %f6,%f18,%f20           ! (3_2) x_lo = x - x_hi;

        fmuld   %f4,%f4,%f22            ! (3_2) dtmp0 = y_hi * y_hi;
        fsubd   %f58,%f4,%f58           ! (3_2) y_lo = y - y_hi;

        fmuld   %f38,%f24,%f10          ! (1_2) res = dmax * res;
        st      %f10,[%i5]              ! (1_2) ((float*)pz)[0] = ((float*)&res)[0];

        st      %f11,[%i5+4]            ! (1_2) ((float*)pz)[1] = ((float*)&res)[1];

        subcc   counter,1,counter
        bneg,a  .begin
        add     %i5,stridez,%i5

        fmuld   %f28,%f20,%f28          ! (3_2) dtmp1 *= x_lo;

        fmuld   %f32,%f58,%f24          ! (3_2) dtmp2 *= y_lo;

        faddd   %f60,%f22,%f22          ! (3_2) res += dtmp0;

        faddd   %f28,%f24,%f26          ! (3_2) dtmp1 += dtmp2;

        faddd   %f22,%f26,%f28          ! (3_2) res += dtmp1;

        add     %i5,stridez,%l6         ! pz += stridez

        fsqrtd  %f28,%f4                ! (3_2) res = sqrt(res);
        add     %l2,stridex,%l1         ! px += stridex

        fmuld   %f52,%f12,%f12          ! (2_2) res = dmax * res;
        st      %f12,[%l6]              ! (2_2) ((float*)pz)[0] = ((float*)&res)[0];

        st      %f13,[%l6+4]            ! (2_2) ((float*)pz)[1] = ((float*)&res)[1];

        subcc   counter,1,counter
        bneg    .begin
        add     %l6,stridez,%i5

        fmuld   %f14,%f4,%f14           ! (3_2) res = dmax * res;
        st      %f14,[%i5]              ! (3_2) ((float*)pz)[0] = ((float*)&res)[0];

        st      %f15,[%i5+4]            ! (3_2) ((float*)pz)[1] = ((float*)&res)[1];

        ba      .begin
        add     %i5,stridez,%i5

        .align  16
! .spec0: scalar special-case path for a single (x, y) pair held in
! %f26/%f24. Every exit below advances px (%i1), py (%i3) and pz (%i5) by
! one stride, decrements counter and re-enters the main code at .begin1.
!
! Outcomes:
!   - j0 >= 0x7ff00000 (first local label 1): Inf/NaN handling;
!   - exponent gap between the operands >= 0x03600000: result is x + y;
!   - otherwise: the hi/lo split sum of squares is evaluated on operands
!     multiplied by DC2, and the square root is multiplied by DC3.
!
! NOTE(review): the dispatch into this label and the loading of DC2/DC3
! are outside this view. If those registers hold the table entries
! 0x00100000,0 and 0x7fd00000,0, this is a scale-down/scale-up pair that
! would keep very large operands from overflowing -- confirm.
.spec0:
        ld      [%i1+4],%l1             ! lx = ((int*)px)[1];
        cmp     %o2,%o4                 ! j0 ? 0x7ff00000
        bge,pn  %icc,1f                 ! if ( j0 >= 0x7ff00000 )
        fabsd   %f26,%f26               ! x = fabs(x);

! Finite path. The fabsd above sits in a non-annulled delay slot, so x is
! already non-negative on both paths.
! NOTE(review): the subtraction below is %o0 - %l4; the Inf/NaN code further
! down treats %o0 as hx and %l4 as hy, so this is hx - hy rather than
! hy - hx. Only the magnitude is used, so the result is the same.
        sub     %o0,%l4,%o0             ! diff = hy - hx;
        fabsd   %f24,%f24               ! y = fabs(y);

        sra     %o0,31,%l4              ! j0 = diff >> 31;

        xor     %o0,%l4,%o0             ! diff ^ j0

        sethi   %hi(0x03600000),%l1
        sub     %o0,%l4,%o0             ! (diff ^ j0) - j0

! The faddd is in an annulled delay slot: it executes only when the branch
! to label 2 is taken, so %f24 still holds y on the fall-through path.
        cmp     %o0,%l1                 ! ((diff ^ j0) - j0) ? 0x03600000
        bge,a,pn        %icc,2f         ! if ( ((diff ^ j0) - j0) >= 0x03600000 )
        faddd   %f26,%f24,%f24          ! *pz = x + y

        fmuld   %f26,DC2,%f36           ! (1_1) x *= dnorm;

        fmuld   %f24,DC2,%f56           ! (1_1) y *= dnorm;

! Adding and then subtracting D2ON28 rounds away the low-order bits,
! leaving x_hi / y_hi; the remainders become x_lo / y_lo below.
        faddd   %f36,D2ON28,%f58        ! (1_1) x_hi = x + D2ON28;

        faddd   %f56,D2ON28,%f22        ! (1_1) y_hi = y + D2ON28;

        fsubd   %f58,D2ON28,%f58        ! (1_1) x_hi -= D2ON28;

        fsubd   %f22,D2ON28,%f22        ! (1_1) y_hi -= D2ON28;

! x*x + y*y is assembled as
!   x_hi*x_hi + y_hi*y_hi + (x + x_hi)*x_lo + (y + y_hi)*y_lo
        fmuld   %f58,%f58,%f60          ! (1_1) res = x_hi * x_hi;
        faddd   %f56,%f22,%f28          ! (1_1) dtmp2 = y + y_hi;

        faddd   %f36,%f58,%f6           ! (1_1) dtmp1 = x + x_hi;

        fsubd   %f36,%f58,%f58          ! (1_1) x_lo = x - x_hi;

        fmuld   %f22,%f22,%f2           ! (1_1) dtmp0 = y_hi * y_hi;
        fsubd   %f56,%f22,%f56          ! (1_1) y_lo = y - y_hi;

        fmuld   %f6,%f58,%f10           ! (1_1) dtmp1 *= x_lo;

        fmuld   %f28,%f56,%f26          ! (1_1) dtmp2 *= y_lo;

        faddd   %f60,%f2,%f24           ! (1_1) res += dtmp0;

        faddd   %f10,%f26,%f28          ! (1_1) dtmp1 += dtmp2;

        faddd   %f24,%f28,%f26          ! (1_1) res += dtmp1;

        fsqrtd  %f26,%f24               ! (1_1) res = sqrt(res);

        fmuld   DC3,%f24,%f24           ! (1_2) res = dmax * res;
! Common store for the finite path: %f24 is either x + y or the rescaled
! square root.
2:
        add     %i3,stridey,%i3
        add     %i1,stridex,%i1
        st      %f24,[%i5]              ! ((float*)pz)[0] = ((float*)&res)[0];
        st      %f25,[%i5+4]            ! ((float*)pz)[1] = ((float*)&res)[1];

        add     %i5,stridez,%i5
        ba      .begin1
        sub     counter,1,counter

! Inf/NaN path (j0 >= 0x7ff00000). If either operand has high word
! 0x7ff00000 and low word 0, the stored result is DC0 (label 2 below);
! otherwise the product x * y is stored.
1:
        ld      [%i3+4],%l2             ! ly = ((int*)py)[1];
        cmp     %o0,%o4                 ! hx ? 0x7ff00000
        bne,pn  %icc,1f                 ! if ( hx != 0x7ff00000 )
        fabsd   %f24,%f24               ! y = fabs(y);

        cmp     %l1,0                   ! lx ? 0
        be,pn   %icc,2f                 ! if ( lx == 0 )
        nop
1:
        cmp     %l4,%o4                 ! hy ? 0x7ff00000
        bne,pn  %icc,1f                 ! if ( hy != 0x7ff00000 )
        nop

        cmp     %l2,0                   ! ly ? 0
        be,pn   %icc,2f                 ! if ( ly == 0 )
        nop
! Neither operand matched the 0x7ff00000:0 pattern.
1:
        add     %i3,stridey,%i3
        add     %i1,stridex,%i1
        fmuld   %f26,%f24,%f24          ! res = x * y;
        st      %f24,[%i5]              ! ((float*)pz)[0] = ((float*)&res)[0];

        st      %f25,[%i5+4]            ! ((float*)pz)[1] = ((float*)&res)[1];

        add     %i5,stridez,%i5
        ba      .begin1
        sub     counter,1,counter

! One operand matched the 0x7ff00000:0 pattern: store DC0.
! NOTE(review): the fcmpd result is not consumed in this view; presumably
! it is issued only for its floating-point exception side effect -- confirm.
2:
        add     %i1,stridex,%i1
        add     %i3,stridey,%i3
        st      DC0_HI,[%i5]            ! ((int*)pz)[0] = 0x7ff00000;
        st      DC0_LO,[%i5+4]          ! ((int*)pz)[1] = 0;
        fcmpd   %f26,%f24               ! x ? y

        add     %i5,stridez,%i5
        ba      .begin1
        sub     counter,1,counter

        .align  16
! .spec1: scalar special-case path for a single (x, y) pair held in
! %f26/%f24. Both operands are multiplied by DC3, the hi/lo split sum of
! squares is evaluated, and the square root is multiplied by DC2. The result
! is stored at pz (%i5); px (%i1), py (%i3) and pz then advance by one
! stride, counter is decremented and control returns to .begin1.
!
! NOTE(review): the dispatch into this label and the loading of DC2/DC3
! are outside this view. If those registers hold the table entries
! 0x00100000,0 and 0x7fd00000,0, this is the scale-up/scale-down
! counterpart of .spec0, presumably for very small operands -- confirm.
.spec1:
        fmuld   %f26,DC3,%f36           ! (1_1) x *= dnorm;

        fmuld   %f24,DC3,%f56           ! (1_1) y *= dnorm;

! Adding and then subtracting D2ON28 rounds away the low-order bits,
! leaving x_hi / y_hi; the remainders become x_lo / y_lo below.
        faddd   %f36,D2ON28,%f58        ! (1_1) x_hi = x + D2ON28;

        faddd   %f56,D2ON28,%f22        ! (1_1) y_hi = y + D2ON28;

        fsubd   %f58,D2ON28,%f58        ! (1_1) x_hi -= D2ON28;

        fsubd   %f22,D2ON28,%f22        ! (1_1) y_hi -= D2ON28;

! x*x + y*y is assembled as
!   x_hi*x_hi + y_hi*y_hi + (x + x_hi)*x_lo + (y + y_hi)*y_lo
        fmuld   %f58,%f58,%f60          ! (1_1) res = x_hi * x_hi;
        faddd   %f56,%f22,%f28          ! (1_1) dtmp2 = y + y_hi;

        faddd   %f36,%f58,%f6           ! (1_1) dtmp1 = x + x_hi;

        fsubd   %f36,%f58,%f58          ! (1_1) x_lo = x - x_hi;

        fmuld   %f22,%f22,%f2           ! (1_1) dtmp0 = y_hi * y_hi;
        fsubd   %f56,%f22,%f56          ! (1_1) y_lo = y - y_hi;

        fmuld   %f6,%f58,%f10           ! (1_1) dtmp1 *= x_lo;

        fmuld   %f28,%f56,%f26          ! (1_1) dtmp2 *= y_lo;

        faddd   %f60,%f2,%f24           ! (1_1) res += dtmp0;

        faddd   %f10,%f26,%f28          ! (1_1) dtmp1 += dtmp2;

        faddd   %f24,%f28,%f26          ! (1_1) res += dtmp1;

        fsqrtd  %f26,%f24               ! (1_1) res = sqrt(res);

        fmuld   DC2,%f24,%f24           ! (1_2) res = dmax * res;

        add     %i3,stridey,%i3
        add     %i1,stridex,%i1
        st      %f24,[%i5]              ! ((float*)pz)[0] = ((float*)&res)[0];

        st      %f25,[%i5+4]            ! ((float*)pz)[1] = ((float*)&res)[1];
        add     %i5,stridez,%i5
        ba      .begin1
        sub     counter,1,counter

        .align  16
! .update0: caps the current batch at 1 element and resumes at .cont0.
!   Always:          %f50 = %f34 = 0
!   If counter > 1:  tmp_px = %o1, tmp_py = %i3,
!                    tmp_counter = counter - 1, counter = 1
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update0:
        fzero   %f50                    ! %f50 = 0
        cmp     counter,1
        ble     .cont0                  ! counter <= 1: resume unchanged
        fzero   %f34                    ! delay slot, runs on both paths: %f34 = 0

        mov     %o1,tmp_px              ! tmp_px = %o1
        mov     %i3,tmp_py              ! tmp_py = %i3

        sub     counter,1,tmp_counter   ! tmp_counter = counter - 1
        ba      .cont0
        mov     1,counter               ! delay slot: counter = 1

        .align  16
! .update1: caps the current batch at 1 element and resumes at .cont1.
!   Always:          %f50 = %f34 = 0
!   If counter > 1:  tmp_px = %o1, tmp_py = %i3,
!                    tmp_counter = counter - 1, counter = 1
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update1:
        fzero   %f50                    ! %f50 = 0
        cmp     counter,1
        ble     .cont1                  ! counter <= 1: resume unchanged
        fzero   %f34                    ! delay slot, runs on both paths: %f34 = 0

        mov     %o1,tmp_px              ! tmp_px = %o1
        mov     %i3,tmp_py              ! tmp_py = %i3

        sub     counter,1,tmp_counter   ! tmp_counter = counter - 1
        ba      .cont1
        mov     1,counter               ! delay slot: counter = 1

        .align  16
! .update2: caps the current batch at 2 elements and resumes at .cont2.
!   Always:          %f18 = %f30 = 0
!   If counter > 2:  tmp_px = %l2, tmp_py = %l4,
!                    tmp_counter = counter - 2, counter = 2
! Both exits must return to .cont2, the continuation paired with this
! handler; resuming at any other .contN would re-enter a different stage
! of the software pipeline with counter already rewritten.
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update2:
        fzero   %f18                    ! %f18 = 0
        cmp     counter,2
        ble     .cont2                  ! counter <= 2: resume unchanged
        fzero   %f30                    ! delay slot, runs on both paths: %f30 = 0

        mov     %l2,tmp_px              ! tmp_px = %l2
        mov     %l4,tmp_py              ! tmp_py = %l4

        sub     counter,2,tmp_counter   ! tmp_counter = counter - 2
        ba      .cont2                  ! same continuation as the early-out above
        mov     2,counter               ! delay slot: counter = 2

        .align  16
! .update3: caps the current batch at 2 elements and resumes at .cont3.
!   Always:          %f18 = %f30 = 0
!   If counter > 2:  tmp_px = %l2, tmp_py = %l4,
!                    tmp_counter = counter - 2, counter = 2
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update3:
        fzero   %f18                    ! %f18 = 0
        cmp     counter,2
        ble     .cont3                  ! counter <= 2: resume unchanged
        fzero   %f30                    ! delay slot, runs on both paths: %f30 = 0

        mov     %l2,tmp_px              ! tmp_px = %l2
        mov     %l4,tmp_py              ! tmp_py = %l4

        sub     counter,2,tmp_counter   ! tmp_counter = counter - 2
        ba      .cont3
        mov     2,counter               ! delay slot: counter = 2

        .align  16
! .update4: caps the current batch at 3 elements and resumes at .cont4.
!   Always:          %f20 = %f40 = 0
!   If counter > 3:  tmp_px = %l1, tmp_py = %i3,
!                    tmp_counter = counter - 3, counter = 3
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update4:
        fzero   %f20                    ! %f20 = 0
        cmp     counter,3
        ble     .cont4                  ! counter <= 3: resume unchanged
        fzero   %f40                    ! delay slot, runs on both paths: %f40 = 0

        mov     %l1,tmp_px              ! tmp_px = %l1
        mov     %i3,tmp_py              ! tmp_py = %i3

        sub     counter,3,tmp_counter   ! tmp_counter = counter - 3
        ba      .cont4
        mov     3,counter               ! delay slot: counter = 3

        .align  16
! .update5: caps the current batch at 3 elements and resumes at .cont5.
!   Always:          %f20 = %f40 = 0
!   If counter > 3:  tmp_px = %l1, tmp_py = %i3,
!                    tmp_counter = counter - 3, counter = 3
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update5:
        fzero   %f20                    ! %f20 = 0
        cmp     counter,3
        ble     .cont5                  ! counter <= 3: resume unchanged
        fzero   %f40                    ! delay slot, runs on both paths: %f40 = 0

        mov     %l1,tmp_px              ! tmp_px = %l1
        mov     %i3,tmp_py              ! tmp_py = %i3

        sub     counter,3,tmp_counter   ! tmp_counter = counter - 3
        ba      .cont5
        mov     3,counter               ! delay slot: counter = 3

        .align  16
! .update6: caps the current batch at 4 elements and resumes at .cont6.
!   Always:          %f36 = %f54 = 0
!   If counter > 4:  tmp_px = %l7, tmp_py = %l2,
!                    tmp_counter = counter - 4, counter = 4
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update6:
        fzero   %f36                    ! %f36 = 0
        cmp     counter,4
        ble     .cont6                  ! counter <= 4: resume unchanged
        fzero   %f54                    ! delay slot, runs on both paths: %f54 = 0

        mov     %l7,tmp_px              ! tmp_px = %l7
        mov     %l2,tmp_py              ! tmp_py = %l2

        sub     counter,4,tmp_counter   ! tmp_counter = counter - 4
        ba      .cont6
        mov     4,counter               ! delay slot: counter = 4

        .align  16
! .update7: caps the current batch at 4 elements and resumes at .cont7.
!   Always:          %f36 = %f54 = 0
!   If counter > 4:  tmp_px = %l7, tmp_py = %l2,
!                    tmp_counter = counter - 4, counter = 4
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update7:
        fzero   %f36                    ! %f36 = 0
        cmp     counter,4
        ble     .cont7                  ! counter <= 4: resume unchanged
        fzero   %f54                    ! delay slot, runs on both paths: %f54 = 0

        mov     %l7,tmp_px              ! tmp_px = %l7
        mov     %l2,tmp_py              ! tmp_py = %l2

        sub     counter,4,tmp_counter   ! tmp_counter = counter - 4
        ba      .cont7
        mov     4,counter               ! delay slot: counter = 4

        .align  16
! .update8: caps the current batch at 5 elements and resumes at .cont8.
!   Always:          %f50 = %f34 = 0
!   If counter > 5:  tmp_px = %o1, tmp_py = %i3,
!                    tmp_counter = counter - 5, counter = 5
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update8:
        fzero   %f50                    ! %f50 = 0
        cmp     counter,5
        ble     .cont8                  ! counter <= 5: resume unchanged
        fzero   %f34                    ! delay slot, runs on both paths: %f34 = 0

        mov     %o1,tmp_px              ! tmp_px = %o1
        mov     %i3,tmp_py              ! tmp_py = %i3

        sub     counter,5,tmp_counter   ! tmp_counter = counter - 5
        ba      .cont8
        mov     5,counter               ! delay slot: counter = 5

        .align  16
! .update9: caps the current batch at 5 elements and resumes at .cont9.
!   Always:          %f50 = %f34 = 0
!   If counter > 5:  tmp_px = %o1, tmp_py = %i3,
!                    tmp_counter = counter - 5, counter = 5
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update9:
        fzero   %f50                    ! %f50 = 0
        cmp     counter,5
        ble     .cont9                  ! counter <= 5: resume unchanged
        fzero   %f34                    ! delay slot, runs on both paths: %f34 = 0

        mov     %o1,tmp_px              ! tmp_px = %o1
        mov     %i3,tmp_py              ! tmp_py = %i3

        sub     counter,5,tmp_counter   ! tmp_counter = counter - 5
        ba      .cont9
        mov     5,counter               ! delay slot: counter = 5


        .align  16
! .update10: caps the current batch at 2 elements and resumes at .cont10.
!   Always:          %f18 = %f30 = 0
!   If counter > 2:  tmp_px = %l2, tmp_py = %l4,
!                    tmp_counter = counter - 2, counter = 2
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update10:
        fzero   %f18                    ! %f18 = 0
        cmp     counter,2
        ble     .cont10                 ! counter <= 2: resume unchanged
        fzero   %f30                    ! delay slot, runs on both paths: %f30 = 0

        mov     %l2,tmp_px              ! tmp_px = %l2
        mov     %l4,tmp_py              ! tmp_py = %l4

        sub     counter,2,tmp_counter   ! tmp_counter = counter - 2
        ba      .cont10
        mov     2,counter               ! delay slot: counter = 2

        .align  16
! .update11: caps the current batch at 2 elements and resumes at .cont11.
!   Always:          %f18 = %f30 = 0
!   If counter > 2:  tmp_px = %l2, tmp_py = %l4,
!                    tmp_counter = counter - 2, counter = 2
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update11:
        fzero   %f18                    ! %f18 = 0
        cmp     counter,2
        ble     .cont11                 ! counter <= 2: resume unchanged
        fzero   %f30                    ! delay slot, runs on both paths: %f30 = 0

        mov     %l2,tmp_px              ! tmp_px = %l2
        mov     %l4,tmp_py              ! tmp_py = %l4

        sub     counter,2,tmp_counter   ! tmp_counter = counter - 2
        ba      .cont11
        mov     2,counter               ! delay slot: counter = 2

        .align  16
! .update12: caps the current batch at 3 elements and resumes at .cont12.
!   Always:          %f20 = %f40 = 0
!   If counter > 3:  tmp_px = %l1, tmp_py = %i3,
!                    tmp_counter = counter - 3, counter = 3
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update12:
        fzero   %f20                    ! %f20 = 0
        cmp     counter,3
        ble     .cont12                 ! counter <= 3: resume unchanged
        fzero   %f40                    ! delay slot, runs on both paths: %f40 = 0

        mov     %l1,tmp_px              ! tmp_px = %l1
        mov     %i3,tmp_py              ! tmp_py = %i3

        sub     counter,3,tmp_counter   ! tmp_counter = counter - 3
        ba      .cont12
        mov     3,counter               ! delay slot: counter = 3

        .align  16
! .update13: caps the current batch at 3 elements and resumes at .cont13.
!   Always:          %f20 = %f40 = 0
!   If counter > 3:  tmp_px = %l1, tmp_py = %i3,
!                    tmp_counter = counter - 3, counter = 3
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update13:
        fzero   %f20                    ! %f20 = 0
        cmp     counter,3
        ble     .cont13                 ! counter <= 3: resume unchanged
        fzero   %f40                    ! delay slot, runs on both paths: %f40 = 0

        mov     %l1,tmp_px              ! tmp_px = %l1
        mov     %i3,tmp_py              ! tmp_py = %i3

        sub     counter,3,tmp_counter   ! tmp_counter = counter - 3
        ba      .cont13
        mov     3,counter               ! delay slot: counter = 3

        .align  16
! .update14: caps the current batch at 4 elements and resumes at .cont14.
!   Always:          %f54 = %f36 = 0
!   If counter > 4:  tmp_px = %l7, tmp_py = %l2,
!                    tmp_counter = counter - 4, counter = 4
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update14:
        fzero   %f54                    ! %f54 = 0
        cmp     counter,4
        ble     .cont14                 ! counter <= 4: resume unchanged
        fzero   %f36                    ! delay slot, runs on both paths: %f36 = 0

        mov     %l7,tmp_px              ! tmp_px = %l7
        mov     %l2,tmp_py              ! tmp_py = %l2

        sub     counter,4,tmp_counter   ! tmp_counter = counter - 4
        ba      .cont14
        mov     4,counter               ! delay slot: counter = 4

        .align  16
! .update15: caps the current batch at 4 elements and resumes at .cont15.
!   Always:          %f54 = %f36 = 0
!   If counter > 4:  tmp_px = %l7, tmp_py = %l2,
!                    tmp_counter = counter - 4, counter = 4
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update15:
        fzero   %f54                    ! %f54 = 0
        cmp     counter,4
        ble     .cont15                 ! counter <= 4: resume unchanged
        fzero   %f36                    ! delay slot, runs on both paths: %f36 = 0

        mov     %l7,tmp_px              ! tmp_px = %l7
        mov     %l2,tmp_py              ! tmp_py = %l2

        sub     counter,4,tmp_counter   ! tmp_counter = counter - 4
        ba      .cont15
        mov     4,counter               ! delay slot: counter = 4

        .align  16
! .update16: caps the current batch at 5 elements and resumes at .cont16.
!   Always:          %f50 = %f34 = 0
!   If counter > 5:  tmp_px = %o1, tmp_py = %i3,
!                    tmp_counter = counter - 5, counter = 5
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update16:
        fzero   %f50                    ! %f50 = 0
        cmp     counter,5
        ble     .cont16                 ! counter <= 5: resume unchanged
        fzero   %f34                    ! delay slot, runs on both paths: %f34 = 0

        mov     %o1,tmp_px              ! tmp_px = %o1
        mov     %i3,tmp_py              ! tmp_py = %i3

        sub     counter,5,tmp_counter   ! tmp_counter = counter - 5
        ba      .cont16
        mov     5,counter               ! delay slot: counter = 5

        .align  16
! .update17: caps the current batch at 5 elements and resumes at .cont17.
!   Always:          %f50 = %f34 = 0
!   If counter > 5:  tmp_px = %o1, tmp_py = %i3,
!                    tmp_counter = counter - 5, counter = 5
! NOTE(review): presumably reached when an operand pair further along the
! pipeline needs a special-case path, with .begin restarting later from
! tmp_px/tmp_py; the dispatch site is outside this view -- confirm.
.update17:
        fzero   %f50                    ! %f50 = 0
        cmp     counter,5
        ble     .cont17                 ! counter <= 5: resume unchanged
        fzero   %f34                    ! delay slot, runs on both paths: %f34 = 0

        mov     %o1,tmp_px              ! tmp_px = %o1
        mov     %i3,tmp_py              ! tmp_py = %i3

        sub     counter,5,tmp_counter   ! tmp_counter = counter - 5
        ba      .cont17
        mov     5,counter               ! delay slot: counter = 5

        .align  16
! .exit: function return. The restore executes in the delay slot of ret.
! NOTE(review): the matching save at function entry and the definition of
! the SET_SIZE macro are outside this view; SET_SIZE presumably records
! the symbol size of __vhypot -- confirm.
.exit:
        ret                             ! return to the caller
        restore                         ! delay slot
        SET_SIZE(__vhypot)