root/usr/src/common/bignum/sun4u/mont_mulf_kernel_v9.S
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file is mostly a result of compiling the mont_mulf.c file to generate an
 * assembly output and then hand-editing that output to replace the
 * compiler-generated loop for the 512-bit case (nlen == 16) in the
 * mont_mulf_noconv routine with a hand-crafted version. This file also
 * has big_savefp() and big_restorefp() routines added by hand.
 */

#include <sys/asm_linkage.h>
#include <sys/trap.h>
#include <sys/stack.h>
#include <sys/privregs.h>
#include <sys/regset.h>
#include <sys/vis.h>
#include <sys/machthread.h>
#include <sys/machtrap.h>
#include <sys/machsig.h>

        .section        ".text",#alloc,#execinstr
        .file   "mont_mulf.s"

        .section        ".bss",#alloc,#write
Bbss.bss:

        .section        ".data",#alloc,#write
Ddata.data:

        .section        ".rodata",#alloc
!
! CONSTANT POOL
!
! Read-only IEEE-754 double-precision constants used by the mont_mulf
! routines.  Each constant occupies 8 bytes and is emitted as two 32-bit
! words, high word first.  All five symbols are exported.
!
Drodata.rodata:
        .align  8

        .global TwoTo16
TwoTo16:                                ! 65536.0 (2^16)
        .word   0x40f00000
        .word   0x00000000
        .type   TwoTo16,#object
        .size   TwoTo16,8

        .global TwoToMinus16
TwoToMinus16:                           ! 1.0 / 65536.0 (2^-16)
        .word   0x3ef00000
        .word   0x00000000
        .type   TwoToMinus16,#object
        .size   TwoToMinus16,8

        .global Zero
Zero:                                   ! 0.0
        .word   0x00000000
        .word   0x00000000
        .type   Zero,#object
        .size   Zero,8

        .global TwoTo32
TwoTo32:                                ! 65536.0 * 65536.0 (2^32)
        .word   0x41f00000
        .word   0x00000000
        .type   TwoTo32,#object
        .size   TwoTo32,8

        .global TwoToMinus32
TwoToMinus32:                           ! 1.0 / (65536.0 * 65536.0) (2^-32)
        .word   0x3df00000
        .word   0x00000000
        .type   TwoToMinus32,#object
        .size   TwoToMinus32,8

        .section        ".text",#alloc,#execinstr
! Declare %g2 and %g3 as scratch registers for the code that follows.
/* 000000          0 */         .register       %g3,#scratch
/* 000000            */         .register       %g2,#scratch
/* 000000          0 */         .align  32
! FILE mont_mulf.c

!    1                !/*
!    2                ! * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
!    3                ! * Use is subject to license terms.
!    4                ! */
!    6                !#pragma ident    "@(#)mont_mulf.c        1.2     01/09/24 SMI"
!    9                !/*
!   10                ! * If compiled without -DRF_INLINE_MACROS then needs -lm at link time
!   11                ! * If compiled with -DRF_INLINE_MACROS then needs conv.il at compile time
!   12                ! * (i.e. cc <compiler_flags> -DRF_INLINE_MACROS conv.il mont_mulf.c )
!   13                ! */
!   15                !#include <sys/types.h>
!   16                !#include <math.h>
!   18                !static const double TwoTo16 = 65536.0;
!   19                !static const double TwoToMinus16 = 1.0/65536.0;
!   20                !static const double Zero = 0.0;
!   21                !static const double TwoTo32 = 65536.0 * 65536.0;
!   22                !static const double TwoToMinus32 = 1.0 / (65536.0 * 65536.0);
!   24                !#ifdef RF_INLINE_MACROS
!   26                !double upper32(double);
!   27                !double lower32(double, double);
!   28                !double mod(double, double, double);
!   30                !#else
!   32                !static double
!   33                !upper32(double x)
!   34                !{
!   35                ! return (floor(x * TwoToMinus32));
!   36                !}
!   39                !/* ARGSUSED */
!   40                !static double
!   41                !lower32(double x, double y)
!   42                !{
!   43                ! return (x - TwoTo32 * floor(x * TwoToMinus32));
!   44                !}
!   46                !static double
!   47                !mod(double x, double oneoverm, double m)
!   48                !{
!   49                ! return (x - m * floor(x * oneoverm));
!   50                !}
!   52                !#endif
!   55                !static void
!   56                !cleanup(double *dt, int from, int tlen)
!   57                !{

!
! SUBROUTINE cleanup
!
! static void cleanup(double *dt, int from, int tlen)
!
! Walks dt[2*from] .. dt[2*tlen - 1] two doubles at a time.  Each element
! is replaced by its low 32 bits (as a double) plus the upper part of the
! element two positions earlier, so even and odd elements form two
! independent carry chains (tmp and tmp1 in the C source).
!
! In:     %o0 = dt, %o1 = from, %o2 = tlen
! Out:    none (dt is updated in place)
! Leaf routine: no register window is allocated (returns with retl).
! Writes: %o1, %o3-%o5, %g1-%g5, %f0-%f18, integer condition codes.
!
! lower32 and upper32 are expanded inline here: the double is converted
! to a 64-bit integer with fdtox; the low word is isolated by overwriting
! the high word with zero and converting back with fxtod; the high word
! is converted on its own with fitod.
!
! OFFSET    SOURCE LINE LABEL   INSTRUCTION

                       cleanup:
! Sign-extend the two 32-bit int arguments.
/* 000000         57 */         sra     %o1,0,%o4
/* 0x0004            */         sra     %o2,0,%o5

!   58                ! int i;
!   59                ! double tmp, tmp1, x, x1;
!   61                ! tmp = tmp1 = Zero;

! %g5 = 2 * tlen (exclusive loop bound)
/* 0x0008         61 */         sll     %o5,1,%g5

!   63                ! for (i = 2 * from; i < 2 * tlen; i += 2) {

! %g3 = i = 2 * from; return immediately when the range is empty.
/* 0x000c         63 */         sll     %o4,1,%g3
/* 0x0010            */         cmp     %g3,%g5
/* 0x0014            */         bge,pn  %icc,.L77000188
! Delay slot: begin forming the address of Zero (harmless if branch taken).
/* 0x0018          0 */         sethi   %hi(Zero),%o3
                       .L77000197:
! Loop setup:
!   %f8  = 0.0; its high word (%f8) is also used below as a 32-bit zero
!   %g2  = 2 * tlen - 1, so the loop test can be written as i <= %g2
!   %g1  = &dt[i]
!   %f10 = x = dt[i] for the first iteration
!   %f18 = tmp = 0.0, %f16 = tmp1 = 0.0
/* 0x001c         63 */         ldd     [%o3+%lo(Zero)],%f8
/* 0x0020            */         sra     %g3,0,%o1
/* 0x0024            */         sub     %g5,1,%g2
/* 0x0028            */         sllx    %o1,3,%g4

!   64                !         x = dt[i];

/* 0x002c         64 */         ldd     [%g4+%o0],%f10
/* 0x0030         63 */         add     %g4,%o0,%g1
/* 0x0034            */         fmovd   %f8,%f18
/* 0x0038            */         fmovd   %f8,%f16

!   65                !         x1 = dt[i + 1];
!   66                !         dt[i] = lower32(x, Zero) + tmp;

                       .L900000110:
! %f0/%f1 = x as a 64-bit integer; %f12 = x1 = dt[i + 1]
/* 0x003c         66 */         fdtox   %f10,%f0
/* 0x0040         65 */         ldd     [%g1+8],%f12

!   67                !         dt[i + 1] = lower32(x1, Zero) + tmp1;
!   68                !         tmp = upper32(x);
!   69                !         tmp1 = upper32(x1);

! Advance i and compare early; the result is consumed by the ble below.
/* 0x0044         69 */         add     %g3,2,%g3
/* 0x0048            */         cmp     %g3,%g2
! %f2/%f3 = x1 as a 64-bit integer
/* 0x004c         67 */         fdtox   %f12,%f2
! Keep a copy of the integer x; its high word %f4 is upper32(x).
/* 0x0050         68 */         fmovd   %f0,%f4
! Zero the high words so that only the low 32 bits of x and x1 remain.
/* 0x0054         66 */         fmovs   %f8,%f0
/* 0x0058         67 */         fmovs   %f8,%f2
! Back to double: %f0 = lower32(x), %f2 = lower32(x1)
/* 0x005c         66 */         fxtod   %f0,%f0
/* 0x0060         67 */         fxtod   %f2,%f2
! Second integer conversion of x1; its high word %f6 is upper32(x1).
/* 0x0064         69 */         fdtox   %f12,%f6
! dt[i] = lower32(x) + tmp
/* 0x0068         66 */         faddd   %f0,%f18,%f10
/* 0x006c            */         std     %f10,[%g1]
! dt[i + 1] = lower32(x1) + tmp1
/* 0x0070         67 */         faddd   %f2,%f16,%f14
/* 0x0074            */         std     %f14,[%g1+8]
! tmp = upper32(x): the high 32-bit word converted as a signed integer
/* 0x0078         68 */         fitod   %f4,%f18
/* 0x007c         69 */         add     %g1,16,%g1
! tmp1 = upper32(x1)
/* 0x0080            */         fitod   %f6,%f16
! Loop while i <= 2 * tlen - 1.  The branch is annulled, so the delay-slot
! load of the next x runs only when the branch is taken.
/* 0x0084            */         ble,a,pt        %icc,.L900000110
/* 0x0088         64 */         ldd     [%g1],%f10
                       .L77000188:
/* 0x008c         69 */         retl    ! Result =
/* 0x0090            */         nop
/* 0x0094          0 */         .type   cleanup,2
/* 0x0094          0 */         .size   cleanup,(.-cleanup)

        .section        ".text",#alloc,#execinstr
        .align  8
        .skip   24
        .align  32

!
! SUBROUTINE double2uint64_t
!
! uint64_t double2uint64_t(double *d)
!
! Kernel (_KERNEL) variant from mont_mulf.c: converts *d to an unsigned
! 64-bit integer using integer instructions only, by decoding the
! IEEE-754 bit pattern directly.  Per the C source this only works if
! 0 <= d < 2^53.
!
! C equivalent:
!       x = *((uint64_t *)d);
!       if (x == 0)
!               return (0ULL);
!       exp = (x >> 52) - 1023;
!       man = (x & 0xfffffffffffffULL) | 0x10000000000000ULL;
!       return (man >> (52 - exp));
!
! In:     %o0 = d
! Out:    %o0 = converted value
! Leaf routine: no register window is allocated (returns with retl).
! Writes: %o0-%o5, integer condition codes.
!
                        .global double2uint64_t
                       double2uint64_t:
                ldx     [%o0],%o1               ! %o1 = raw bits of *d
                cmp     %o1,0
                bne,pn  %xcc,.L900000206        ! non-zero: decode it
                sethi   %hi(0x40000000),%o2     ! (delay) seed for 2^52
                       .L77000202:
                retl                            ! all bits zero: result is 0
                or      %g0,0,%o0               ! (delay) return value

                       .L900000206:
                sllx    %o2,22,%o2              ! %o2 = 2^52, the implicit bit
                sub     %o2,1,%o3               ! %o3 = 2^52 - 1, mantissa mask
                srlx    %o1,52,%o4              ! %o4 = biased exponent field
                and     %o1,%o3,%o1             ! keep the 52 stored mantissa bits
                or      %o1,%o2,%o1             ! man = mantissa with implicit bit
                or      %g0,1075,%o5            ! 1075 = 1023 + 52
                sub     %o5,%o4,%o5             ! shift = 52 - (biased - 1023)
                retl
                srlx    %o1,%o5,%o0             ! (delay) result = man >> shift
                .type   double2uint64_t,2
                .size   double2uint64_t,(.-double2uint64_t)

        .section        ".text",#alloc,#execinstr
/* 000000          0 */         .align  8
/* 000000            */         .skip   24
/* 0x0018            */         .align  32

!   95                !}
!   96                !#else
!   97                !/*
!   98                ! * This only works if  0 <= d < 2^63
!   99                ! */
!  100                !uint64_t
!  101                !double2uint64_t(double* d)
!  102                !{
!  103                ! return ((int64_t)(*d));
!  104                !}
!  105                !#endif
!  107                !/* ARGSUSED */
!  108                !void
!  109                !conv_d16_to_i32(uint32_t *i32, double *d16, int64_t *tmp, int ilen)
!  110                !{

!
! SUBROUTINE conv_d16_to_i32
!
! void conv_d16_to_i32(uint32_t *i32, double *d16, int64_t *tmp, int ilen)
!
! Combines the doubles d16[2*i] and d16[2*i + 1] into the 32-bit word
! i32[i] for i = 0 .. ilen - 1, carrying overflow from one output word
! into the next (see the C source lines interleaved below).
!
! In:     %i0 = i32, %i1 = d16, %i2 = tmp, %i3 = ilen
!         tmp is never read; %i2 is overwritten by the first load.
! Out:    none (i32[] is written)
!
! The calls to double2uint64_t are expanded inline.  Each expansion loads
! the raw 64 bits x of a double with ldx and computes
!       x == 0 ? 0 : ((x & (2^52 - 1)) | 2^52) >> (52 - ((x >> 52) - 1023))
!
! Register roles inside the main loop:
!       %i4 = a         %i2 = b         %l1 = c         %o2 = d
!       %o3 = t1 (carry into the current word)
!       %l4 = i         %l5 = ilen - 2 (loop runs while i <= %l5)
!       %g2 = 2 * i + 2 (index of c in d16)
!       %g1 = &i32[i]
!       %l6 = 0xffffffff        %l2 = 0xffff
!       %l7 = 2^52 - 1 (mantissa mask)  %l3 = 2^52 (implicit bit)
!
! OFFSET    SOURCE LINE LABEL   INSTRUCTION

                        .global conv_d16_to_i32
                       conv_d16_to_i32:
/* 000000        110 */         save    %sp,-176,%sp

!  111                ! int i;
!  112                ! int64_t t, t1,          /* using int64_t and not uint64_t */
!  113                !         a, b, c, d;     /* because more efficient code is */
!  114                !                         /* generated this way, and there  */
!  115                !                         /* is no overflow  */
!  116                ! t1 = 0;
!  117                ! a = double2uint64_t(&(d16[0]));

! Load the raw bits of d16[0] and d16[1].
/* 0x0004        117 */         ldx     [%i1],%o0
/* 0x0008        118 */         ldx     [%i1+8],%i2
/* 0x000c        117 */         cmp     %o0,0
/* 0x0010            */         bne,pn  %xcc,.L77000216
! Delay slot: a = 0, the result used when d16[0] is all zero bits.
/* 0x0014            */         or      %g0,0,%i4
                       .L77000215:
! d16[0] is zero: go straight to the conversion of d16[1].
/* 0x0018        117 */         ba      .L900000316
/* 0x001c        118 */         cmp     %i2,0
                       .L77000216:
! Inline double2uint64_t for d16[0]; the result a ends up in %i4.
/* 0x0020        117 */         srlx    %o0,52,%o5
/* 0x0024            */         sethi   %hi(0xfff00000),%i4
/* 0x0028            */         sllx    %i4,32,%o2
/* 0x002c            */         sethi   %hi(0x40000000),%o7
/* 0x0030            */         sllx    %o7,22,%o3
/* 0x0034            */         or      %g0,1023,%o4
/* 0x0038            */         xor     %o2,-1,%g5
/* 0x003c            */         sub     %o4,%o5,%l0
/* 0x0040            */         and     %o0,%g5,%o1
/* 0x0044            */         add     %l0,52,%l1
/* 0x0048            */         or      %o1,%o3,%g4

!  118                ! b = double2uint64_t(&(d16[1]));

/* 0x004c        118 */         cmp     %i2,0
/* 0x0050        117 */         srlx    %g4,%l1,%i4
                       .L900000316:
! Condition codes here reflect the raw bits of d16[1] compared with 0.
/* 0x0054        118 */         bne,pn  %xcc,.L77000222
! Delay slot: %l3 = ilen - 1, used for the loop-entry test.
/* 0x0058        134 */         sub     %i3,1,%l3
                       .L77000221:
! d16[1] is zero: b = 0, t1 = 0.
/* 0x005c        118 */         or      %g0,0,%i2
/* 0x0060            */         ba      .L900000315
/* 0x0064        116 */         or      %g0,0,%o3
                       .L77000222:
! Inline double2uint64_t for d16[1]; the result b ends up in %i2.
/* 0x0068        118 */         srlx    %i2,52,%l6
/* 0x006c            */         sethi   %hi(0xfff00000),%g4
/* 0x0070            */         sllx    %g4,32,%i5
/* 0x0074            */         sethi   %hi(0x40000000),%l5
/* 0x0078            */         xor     %i5,-1,%l4
/* 0x007c            */         or      %g0,1023,%l2
/* 0x0080            */         and     %i2,%l4,%l7
/* 0x0084            */         sllx    %l5,22,%i2
/* 0x0088            */         sub     %l2,%l6,%g1
/* 0x008c            */         or      %l7,%i2,%g3
/* 0x0090            */         add     %g1,52,%g2
! t1 = 0
/* 0x0094        116 */         or      %g0,0,%o3
/* 0x0098        118 */         srlx    %g3,%g2,%i2

!  119                ! for (i = 0; i < ilen - 1; i++) {

                       .L900000315:
! Skip the loop entirely when ilen - 1 <= 0.
/* 0x009c        119 */         cmp     %l3,0
/* 0x00a0            */         ble,pn  %icc,.L77000210
! Delay slot: i = 0 (needed on both paths).
/* 0x00a4            */         or      %g0,0,%l4
                       .L77000245:
! Hoisted loop constants (see the register roles in the header).
/* 0x00a8        118 */         sethi   %hi(0xfff00000),%l7
/* 0x00ac            */         or      %g0,-1,%l6
/* 0x00b0            */         sllx    %l7,32,%l3
/* 0x00b4            */         srl     %l6,0,%l6
/* 0x00b8            */         sethi   %hi(0x40000000),%l1
/* 0x00bc            */         sethi   %hi(0xfc00),%l2
/* 0x00c0            */         xor     %l3,-1,%l7
/* 0x00c4            */         sllx    %l1,22,%l3
/* 0x00c8            */         sub     %i3,2,%l5
/* 0x00cc            */         add     %l2,1023,%l2
/* 0x00d0            */         or      %g0,2,%g2
/* 0x00d4            */         or      %g0,%i0,%g1

!  120                !         c = double2uint64_t(&(d16[2 * i + 2]));

                       .L77000208:
! Byte offsets of d16[2*i + 2] and d16[2*i + 3].
/* 0x00d8        120 */         sra     %g2,0,%g3
/* 0x00dc        123 */         add     %g2,1,%o2
/* 0x00e0        120 */         sllx    %g3,3,%i3

!  121                !         t1 += a & 0xffffffff;
!  122                !         t = (a >> 32);
!  123                !         d = double2uint64_t(&(d16[2 * i + 3]));

/* 0x00e4        123 */         sra     %o2,0,%g5
! %o5 = raw bits of d16[2*i + 2]
/* 0x00e8        120 */         ldx     [%i1+%i3],%o5
/* 0x00ec        123 */         sllx    %g5,3,%o0
! %g4 = a & 0xffffffff
/* 0x00f0        121 */         and     %i4,%l6,%g4
! %i3 = raw bits of d16[2*i + 3]
/* 0x00f4        123 */         ldx     [%i1+%o0],%i3
/* 0x00f8        120 */         cmp     %o5,0
/* 0x00fc            */         bne,pn  %xcc,.L77000228
! Delay slot: %i5 = b & 0xffff
/* 0x0100        124 */         and     %i2,%l2,%i5
                       .L77000227:
! Raw bits are zero: c = 0.
/* 0x0104        120 */         or      %g0,0,%l1
/* 0x0108            */         ba      .L900000314
! Delay slot: %o0 = t1 + (a & 0xffffffff)
/* 0x010c        121 */         add     %o3,%g4,%o0
                       .L77000228:
! Inline double2uint64_t; the result c ends up in %l1.
/* 0x0110        120 */         srlx    %o5,52,%o7
/* 0x0114            */         and     %o5,%l7,%o5
/* 0x0118            */         or      %g0,52,%l0
/* 0x011c            */         sub     %o7,1023,%o4
/* 0x0120            */         or      %o5,%l3,%l1
/* 0x0124            */         sub     %l0,%o4,%o1
/* 0x0128            */         srlx    %l1,%o1,%l1
! %o0 = t1 + (a & 0xffffffff)
/* 0x012c        121 */         add     %o3,%g4,%o0
                       .L900000314:
! %g3 = t = a >> 32
/* 0x0130        122 */         srax    %i4,32,%g3
/* 0x0134        123 */         cmp     %i3,0
/* 0x0138            */         bne,pn  %xcc,.L77000234
! Delay slot: %g5 = (b & 0xffff) << 16
/* 0x013c        124 */         sllx    %i5,16,%g5
                       .L77000233:
! Raw bits are zero: d = 0.
/* 0x0140        123 */         or      %g0,0,%o2
/* 0x0144            */         ba      .L900000313
! Delay slot: %o7 = t1 after adding (b & 0xffff) << 16
/* 0x0148        124 */         add     %o0,%g5,%o7
                       .L77000234:
! Inline double2uint64_t; the result d ends up in %o2.  %i4 and %i5 are
! reused as temporaries here: a and b & 0xffff have already been consumed.
/* 0x014c        123 */         srlx    %i3,52,%o2
/* 0x0150            */         and     %i3,%l7,%i4
/* 0x0154            */         sub     %o2,1023,%o1
/* 0x0158            */         or      %g0,52,%g4
/* 0x015c            */         sub     %g4,%o1,%i5
/* 0x0160            */         or      %i4,%l3,%i3
/* 0x0164            */         srlx    %i3,%i5,%o2

!  124                !         t1 += (b & 0xffff) << 16;

/* 0x0168        124 */         add     %o0,%g5,%o7

!  125                !         t += (b >> 16) + (t1 >> 32);

                       .L900000313:
/* 0x016c        125 */         srax    %i2,16,%l0
/* 0x0170            */         srax    %o7,32,%o4
/* 0x0174            */         add     %l0,%o4,%o3

!  126                !         i32[i] = t1 & 0xffffffff;
!  127                !         t1 = t;
!  128                !         a = c;
!  129                !         b = d;

! i++
/* 0x0178        129 */         add     %l4,1,%l4
/* 0x017c        126 */         and     %o7,%l6,%o5
! %o3 = t, which becomes t1 for the next iteration
/* 0x0180        125 */         add     %g3,%o3,%o3
/* 0x0184        126 */         st      %o5,[%g1]
/* 0x0188        128 */         or      %g0,%l1,%i4
/* 0x018c        129 */         or      %g0,%o2,%i2
/* 0x0190            */         add     %g2,2,%g2
! Loop while i <= ilen - 2, i.e. i < ilen - 1.
/* 0x0194            */         cmp     %l4,%l5
/* 0x0198            */         ble,pt  %icc,.L77000208
! Delay slot: advance the output pointer to &i32[i].
/* 0x019c            */         add     %g1,4,%g1

!  130                ! }
!  131                ! t1 += a & 0xffffffff;
!  132                ! t = (a >> 32);
!  133                ! t1 += (b & 0xffff) << 16;
!  134                ! i32[i] = t1 & 0xffffffff;

                       .L77000210:
! Final word.  a is added without masking and no explicit & 0xffffffff is
! applied to the sum: st writes only the low 32 bits, so bits above
! bit 31 do not affect the stored value.
/* 0x01a0        134 */         sra     %l4,0,%l4
/* 0x01a4            */         sethi   %hi(0xfc00),%i1
/* 0x01a8            */         add     %o3,%i4,%l2
/* 0x01ac            */         add     %i1,1023,%i5
/* 0x01b0            */         and     %i2,%i5,%l5
! %i2 = byte offset of i32[i]
/* 0x01b4            */         sllx    %l4,2,%i2
/* 0x01b8            */         sllx    %l5,16,%l6
/* 0x01bc            */         add     %l2,%l6,%l7
/* 0x01c0            */         st      %l7,[%i0+%i2]
/* 0x01c4        129 */         ret     ! Result =
/* 0x01c8            */         restore %g0,%g0,%g0
/* 0x01cc          0 */         .type   conv_d16_to_i32,2
/* 0x01cc          0 */         .size   conv_d16_to_i32,(.-conv_d16_to_i32)

        .section        ".text",#alloc,#execinstr
/* 000000          0 */         .align  8
!
! CONSTANT POOL
!
! Constants for conv_i32_to_d32:
!   offset 0: the double 2^52 (words 0x43300000, 0)
!   offset 8: the single word 0x43300000, i.e. the high word of 2^52
!
                       ___const_seg_900000401:
/* 000000          0 */         .word   1127219200,0
/* 0x0008            */         .word   1127219200
/* 0x000c          0 */         .type   ___const_seg_900000401,1
/* 0x000c          0 */         .size   ___const_seg_900000401,(.-___const_seg_900000401)
/* 0x000c          0 */         .align  8
/* 0x0010            */         .skip   24
/* 0x0028            */         .align  32

!  135                !}
!  138                !void
!  139                !conv_i32_to_d32(double *d32, uint32_t *i32, int len)
!  140                !{

!
! SUBROUTINE conv_i32_to_d32
!
! void conv_i32_to_d32(double *d32, uint32_t *i32, int len)
!
! Converts len unsigned 32-bit integers to doubles: d32[i] = (double)i32[i].
!
! In:     %o0 = d32, %o1 = i32, %o2 = len
! Out:    none (d32[] is written)
! Leaf routine: no register window is allocated (returns with retl).
!
! Conversion technique: an FP register pair is filled with the high word
! 0x43300000 (even register) and the integer (odd register).  Read as a
! double, that pair is 2^52 + value, so subtracting the double 2^52
! (kept in %f16) yields the value as a double.
!
! Register roles:
!       %o3 = len - 1           %g5 = i (element counter)
!       %o2 = output pointer    %o1 / %o0 = input pointer
!       %g4 = %hi address of the constant segment
!
! For len >= 10 an 8-way unrolled, software-pipelined loop is used; any
! remaining elements, and all elements when len < 10, go through the
! one-element loop at .L900000409.
!
! OFFSET    SOURCE LINE LABEL   INSTRUCTION

                        .global conv_i32_to_d32
                       conv_i32_to_d32:
! Test len; return immediately when len <= 0.
/* 000000        140 */         orcc    %g0,%o2,%o2

!  141                ! int i;
!  143                !#pragma pipeloop(0)
!  144                ! for (i = 0; i < len; i++)

/* 0x0004        144 */         ble,pn  %icc,.L77000254
/* 0x0008            */         sub     %o2,1,%o3
                       .L77000263:
/* 0x000c        140 */         or      %g0,%o0,%o2

!  145                !         d32[i] = (double)(i32[i]);

! %o5 = len; fewer than 10 elements are handled by the simple loop.
/* 0x0010        145 */         add     %o3,1,%o5
/* 0x0014        144 */         or      %g0,0,%g5
/* 0x0018        145 */         cmp     %o5,10
/* 0x001c            */         bl,pn   %icc,.L77000261
/* 0x0020            */         sethi   %hi(___const_seg_900000401),%g4
                       .L900000407:
! Unrolled-loop prologue: issue prefetches, load the constants and start
! the first two conversions.
!   %f2/%f3 = 2^52 + i32[0]     %f0/%f1 = 2^52 + i32[1]
!   %f16    = 2^52
!   %o0     = &i32[2]           %g5 = 2
!   %o5     = len - 8 (unrolled loop runs while i <= %o5)
/* 0x0024        145 */         prefetch        [%o1],0
/* 0x0028            */         prefetch        [%o0],22
/* 0x002c            */         sethi   %hi(___const_seg_900000401+8),%o4
/* 0x0030            */         or      %g0,%o0,%o2
/* 0x0034            */         prefetch        [%o1+64],0
/* 0x0038            */         add     %o1,8,%o0
/* 0x003c            */         sub     %o3,7,%o5
/* 0x0040            */         prefetch        [%o2+64],22
/* 0x0044            */         or      %g0,2,%g5
/* 0x0048            */         prefetch        [%o2+128],22
/* 0x004c            */         prefetch        [%o2+192],22
/* 0x0050            */         prefetch        [%o1+128],0
/* 0x0054            */         ld      [%o4+%lo(___const_seg_900000401+8)],%f2
/* 0x0058            */         ldd     [%g4+%lo(___const_seg_900000401)],%f16
/* 0x005c            */         fmovs   %f2,%f0
/* 0x0060            */         prefetch        [%o2+256],22
/* 0x0064            */         prefetch        [%o2+320],22
/* 0x0068            */         ld      [%o1],%f3
/* 0x006c            */         prefetch        [%o1+192],0
/* 0x0070            */         ld      [%o1+4],%f1
                       .L900000405:
! Unrolled loop body.  With i denoting the value of %g5 on entry:
!   on entry  %f2/%f3 = 2^52 + i32[i - 2], %f0/%f1 = 2^52 + i32[i - 1]
!   stores    d32[i - 2] .. d32[i + 5]
!   loads     i32[i] .. i32[i + 7]
!   on exit   %f2/%f3 and %f0/%f1 hold the last two loaded elements
! Each fmovs (or ld from the constant segment) copies the high word
! 0x43300000 into the even register of the next pair before the matching
! ld fills the odd register with the integer.
/* 0x0074        145 */         prefetch        [%o0+188],0
/* 0x0078            */         fsubd   %f2,%f16,%f22
/* 0x007c            */         add     %g5,8,%g5
/* 0x0080            */         add     %o0,32,%o0
/* 0x0084            */         ld      [%o4+%lo(___const_seg_900000401+8)],%f4
/* 0x0088            */         std     %f22,[%o2]
/* 0x008c            */         cmp     %g5,%o5
/* 0x0090            */         ld      [%o0-32],%f5
/* 0x0094            */         fsubd   %f0,%f16,%f24
/* 0x0098            */         add     %o2,64,%o2
/* 0x009c            */         fmovs   %f4,%f0
/* 0x00a0            */         std     %f24,[%o2-56]
/* 0x00a4            */         ld      [%o0-28],%f1
/* 0x00a8            */         fsubd   %f4,%f16,%f26
/* 0x00ac            */         fmovs   %f0,%f6
/* 0x00b0            */         prefetch        [%o2+312],22
/* 0x00b4            */         std     %f26,[%o2-48]
/* 0x00b8            */         ld      [%o0-24],%f7
/* 0x00bc            */         fsubd   %f0,%f16,%f28
/* 0x00c0            */         fmovs   %f6,%f8
/* 0x00c4            */         std     %f28,[%o2-40]
/* 0x00c8            */         ld      [%o0-20],%f9
/* 0x00cc            */         fsubd   %f6,%f16,%f30
/* 0x00d0            */         fmovs   %f8,%f10
/* 0x00d4            */         std     %f30,[%o2-32]
/* 0x00d8            */         ld      [%o0-16],%f11
/* 0x00dc            */         prefetch        [%o2+344],22
/* 0x00e0            */         fsubd   %f8,%f16,%f48
/* 0x00e4            */         fmovs   %f10,%f12
/* 0x00e8            */         std     %f48,[%o2-24]
/* 0x00ec            */         ld      [%o0-12],%f13
/* 0x00f0            */         fsubd   %f10,%f16,%f50
/* 0x00f4            */         fmovs   %f12,%f2
/* 0x00f8            */         std     %f50,[%o2-16]
/* 0x00fc            */         ld      [%o0-8],%f3
/* 0x0100            */         fsubd   %f12,%f16,%f52
/* 0x0104            */         fmovs   %f2,%f0
/* 0x0108            */         std     %f52,[%o2-8]
! Loop while i <= len - 8 (compare issued above); the delay slot loads
! the last element of this group.
/* 0x010c            */         ble,pt  %icc,.L900000405
/* 0x0110            */         ld      [%o0-4],%f1
                       .L900000408:
! Drain the pipeline: convert and store the two pending elements, then
! finish if no elements remain (i > len - 1).
/* 0x0114        145 */         fsubd   %f2,%f16,%f18
/* 0x0118            */         add     %o2,16,%o2
/* 0x011c            */         cmp     %g5,%o3
/* 0x0120            */         std     %f18,[%o2-16]
/* 0x0124            */         fsubd   %f0,%f16,%f20
! %o1 = address of the next unread input element
/* 0x0128            */         or      %g0,%o0,%o1
/* 0x012c            */         bg,pn   %icc,.L77000254
/* 0x0130            */         std     %f20,[%o2-8]
                       .L77000261:
! One-element-per-iteration loop: %f14/%f15 = 2^52 + i32[i].
/* 0x0134        145 */         ld      [%o1],%f15
                       .L900000409:
/* 0x0138        145 */         sethi   %hi(___const_seg_900000401+8),%o4
/* 0x013c            */         ldd     [%g4+%lo(___const_seg_900000401)],%f16
/* 0x0140            */         add     %g5,1,%g5
/* 0x0144            */         ld      [%o4+%lo(___const_seg_900000401+8)],%f14
/* 0x0148            */         add     %o1,4,%o1
/* 0x014c            */         cmp     %g5,%o3
/* 0x0150            */         fsubd   %f14,%f16,%f54
/* 0x0154            */         std     %f54,[%o2]
/* 0x0158            */         add     %o2,8,%o2
! Loop while i <= len - 1.  The branch is annulled, so the delay-slot
! load of the next element runs only when the branch is taken.
/* 0x015c            */         ble,a,pt        %icc,.L900000409
/* 0x0160            */         ld      [%o1],%f15
                       .L77000254:
/* 0x0164        145 */         retl    ! Result =
/* 0x0168            */         nop
/* 0x016c          0 */         .type   conv_i32_to_d32,2
/* 0x016c          0 */         .size   conv_i32_to_d32,(.-conv_i32_to_d32)

        .section        ".text",#alloc,#execinstr
/* 000000          0 */         .align  8
!
! CONSTANT POOL
!
! Constants for conv_i32_to_d16:
!   offset 0: the double 2^52 (words 0x43300000, 0)
!   offset 8: the single word 0x43300000, i.e. the high word of 2^52
!
                       ___const_seg_900000501:
/* 000000          0 */         .word   1127219200,0
/* 0x0008            */         .word   1127219200
/* 0x000c          0 */         .type   ___const_seg_900000501,1
/* 0x000c          0 */         .size   ___const_seg_900000501,(.-___const_seg_900000501)
/* 0x000c          0 */         .align  8
/* 0x0010            */         .skip   24
/* 0x0028            */         .align  32

!  146                !}
!  149                !void
!  150                !conv_i32_to_d16(double *d16, uint32_t *i32, int len)
!  151                !{

!
! SUBROUTINE conv_i32_to_d16
!
! void conv_i32_to_d16(double *d16, uint32_t *i32, int len)
!
! Splits each 32-bit input word into two 16-bit halves and stores them as
! doubles: d16[2*i] = (double)(a & 0xffff), d16[2*i + 1] = (double)(a >> 16).
!
! In:     %i0 = d16, %i1 = i32, %i2 = len
! Out:    none (d16[] is written)
!
! Conversion technique: each 16-bit half is stored to a stack slot and
! reloaded into the odd register of an FP pair whose even register holds
! 0x43300000.  Read as a double, the pair is 2^52 + value; subtracting
! the double 2^52 (kept in %f20) yields the value as a double.
!
! Register roles:
!       %i4 = 0xffff            %l6 = len - 1
!       %i5 = i (element counter)
!       %i3 = 2 * i (index into d16, used by the one-element loop)
!       %l7 = input pointer     %i2 = d16 base
!       %l2 = output pointer (unrolled loop)
!       %i1 = %hi address of the constant segment
!
! Stack slots used by the unrolled loop (presumably the offsets include
! the 2047-byte V9 stack bias -- confirm against sys/stack.h):
!       [%sp+2335] low half,  even element   [%sp+2271] high half, even element
!       [%sp+2303] low half,  odd element    [%sp+2239] high half, odd element
! The one-element loop uses [%sp+2399] (low half) and [%sp+2367] (high half).
!
! OFFSET    SOURCE LINE LABEL   INSTRUCTION

                        .global conv_i32_to_d16
                       conv_i32_to_d16:
/* 000000        151 */         save    %sp,-368,%sp
! Test len; return immediately when len <= 0.
/* 0x0004            */         orcc    %g0,%i2,%i2

!  152                ! int i;
!  153                ! uint32_t a;
!  155                !#pragma pipeloop(0)
!  156                ! for (i = 0; i < len; i++) {

/* 0x0008        156 */         ble,pn  %icc,.L77000272
/* 0x000c            */         sub     %i2,1,%l6
                       .L77000281:
/* 0x0010        156 */         sethi   %hi(0xfc00),%i3

!  157                !         a = i32[i];

/* 0x0014        157 */         or      %g0,%i2,%l1
/* 0x0018        156 */         add     %i3,1023,%i4
! Fewer than 4 elements are handled by the one-element loop.
/* 0x001c        157 */         cmp     %i2,4
/* 0x0020        151 */         or      %g0,%i1,%l7
/* 0x0024            */         or      %g0,%i0,%i2
/* 0x0028        156 */         or      %g0,0,%i5
/* 0x002c            */         or      %g0,0,%i3
/* 0x0030        157 */         bl,pn   %icc,.L77000279
/* 0x0034          0 */         sethi   %hi(___const_seg_900000501),%i1
                       .L900000508:
! Unrolled-loop prologue: issue prefetches, split i32[0] and i32[1] into
! halves and park them in the four stack slots.
!   %i0 = len - 2 (unrolled loop runs while i <= %i0)
!   %i5 = 2, %i3 = 0, %l7 = &i32[2], %l2 = d16
/* 0x0038        157 */         prefetch        [%i0+8],22
/* 0x003c            */         prefetch        [%i0+72],22
/* 0x0040            */         or      %g0,%i0,%l2

!  158                !         d16[2 * i] = (double)(a & 0xffff);

/* 0x0044        158 */         sethi   %hi(___const_seg_900000501+8),%l1
/* 0x0048        157 */         prefetch        [%i0+136],22
/* 0x004c            */         sub     %l6,1,%i0
/* 0x0050            */         or      %g0,0,%i3
/* 0x0054            */         prefetch        [%i2+200],22
/* 0x0058            */         or      %g0,2,%i5
/* 0x005c            */         prefetch        [%i2+264],22
/* 0x0060            */         prefetch        [%i2+328],22
/* 0x0064            */         prefetch        [%i2+392],22
/* 0x0068            */         ld      [%l7],%l3
/* 0x006c            */         ld      [%l7+4],%l4
/* 0x0070        158 */         ldd     [%i1+%lo(___const_seg_900000501)],%f20

!  159                !         d16[2 * i + 1] = (double)(a >> 16);

/* 0x0074        159 */         srl     %l3,16,%o1
/* 0x0078        158 */         and     %l3,%i4,%o3
/* 0x007c            */         st      %o3,[%sp+2335]
/* 0x0080        159 */         srl     %l4,16,%g4
/* 0x0084        158 */         and     %l4,%i4,%o0
/* 0x0088            */         st      %o0,[%sp+2303]
/* 0x008c        159 */         add     %l7,8,%l7
/* 0x0090            */         st      %o1,[%sp+2271]
/* 0x0094            */         st      %g4,[%sp+2239]
/* 0x0098        157 */         prefetch        [%i2+456],22
/* 0x009c            */         prefetch        [%i2+520],22
                       .L900000506:
! Unrolled loop body, two input words per iteration.  Each stack slot is
! first read into an FP register (the halves parked by the previous
! iteration or the prologue) and then overwritten with the matching half
! of the word just loaded, so the load/store order must be preserved.
/* 0x00a0        157 */         prefetch        [%l2+536],22
/* 0x00a4        159 */         add     %i5,2,%i5
/* 0x00a8        157 */         add     %l2,32,%l2
! %g2 = next even-position input word
/* 0x00ac            */         ld      [%l7],%g2
/* 0x00b0        159 */         cmp     %i5,%i0
/* 0x00b4            */         add     %l7,8,%l7
/* 0x00b8        158 */         ld      [%sp+2335],%f9
/* 0x00bc        159 */         add     %i3,4,%i3
/* 0x00c0        158 */         ld      [%l1+%lo(___const_seg_900000501+8)],%f8
/* 0x00c4        159 */         ld      [%sp+2271],%f11
/* 0x00c8        158 */         and     %g2,%i4,%g3
/* 0x00cc        159 */         fmovs   %f8,%f10
/* 0x00d0        158 */         st      %g3,[%sp+2335]
/* 0x00d4            */         fsubd   %f8,%f20,%f28
/* 0x00d8            */         std     %f28,[%l2-32]
/* 0x00dc        159 */         srl     %g2,16,%g1
/* 0x00e0            */         st      %g1,[%sp+2271]
/* 0x00e4            */         fsubd   %f10,%f20,%f30
/* 0x00e8            */         std     %f30,[%l2-24]
! %l0 = next odd-position input word
/* 0x00ec        157 */         ld      [%l7-4],%l0
/* 0x00f0        158 */         ld      [%sp+2303],%f13
/* 0x00f4            */         ld      [%l1+%lo(___const_seg_900000501+8)],%f12
/* 0x00f8        159 */         ld      [%sp+2239],%f15
/* 0x00fc        158 */         and     %l0,%i4,%l5
/* 0x0100        159 */         fmovs   %f12,%f14
/* 0x0104        158 */         st      %l5,[%sp+2303]
/* 0x0108            */         fsubd   %f12,%f20,%f44
/* 0x010c            */         std     %f44,[%l2-16]
/* 0x0110        159 */         srl     %l0,16,%o5
/* 0x0114            */         st      %o5,[%sp+2239]
/* 0x0118            */         fsubd   %f14,%f20,%f46
! Loop while i <= len - 2 (compare issued above); the delay slot stores
! the fourth result of this iteration.
/* 0x011c            */         ble,pt  %icc,.L900000506
/* 0x0120            */         std     %f46,[%l2-8]
                       .L900000509:
! Drain the pipeline: convert and store the four halves still parked in
! the stack slots, then finish if no elements remain (i > len - 1).
! The last fsubd overwrites %f20; the one-element loop reloads it.
/* 0x0124        158 */         ld      [%l1+%lo(___const_seg_900000501+8)],%f0
/* 0x0128        159 */         cmp     %i5,%l6
/* 0x012c            */         add     %i3,4,%i3
/* 0x0130        158 */         ld      [%sp+2335],%f1
/* 0x0134            */         ld      [%sp+2303],%f5
/* 0x0138        159 */         fmovs   %f0,%f2
/* 0x013c            */         ld      [%sp+2271],%f3
/* 0x0140        158 */         fmovs   %f0,%f4
/* 0x0144        159 */         ld      [%sp+2239],%f7
/* 0x0148            */         fmovs   %f0,%f6
/* 0x014c        158 */         fsubd   %f0,%f20,%f22
/* 0x0150            */         std     %f22,[%l2]
/* 0x0154        159 */         fsubd   %f2,%f20,%f24
/* 0x0158            */         std     %f24,[%l2+8]
/* 0x015c        158 */         fsubd   %f4,%f20,%f26
/* 0x0160            */         std     %f26,[%l2+16]
/* 0x0164        159 */         fsubd   %f6,%f20,%f20
/* 0x0168            */         bg,pn   %icc,.L77000272
/* 0x016c            */         std     %f20,[%l2+24]
                       .L77000279:
! One-element-per-iteration loop: %l2 = a = i32[i].
/* 0x0170        157 */         ld      [%l7],%l2
                       .L900000510:
! Park a & 0xffff and a >> 16 in their stack slots.
/* 0x0174        158 */         and     %l2,%i4,%o4
/* 0x0178            */         st      %o4,[%sp+2399]
/* 0x017c        159 */         srl     %l2,16,%o2
/* 0x0180            */         st      %o2,[%sp+2367]
/* 0x0184        158 */         sethi   %hi(___const_seg_900000501+8),%l1
/* 0x0188            */         sra     %i3,0,%i0
/* 0x018c            */         ld      [%l1+%lo(___const_seg_900000501+8)],%f16
! %o1 = byte offset of d16[2*i]
/* 0x0190            */         sllx    %i0,3,%o1
/* 0x0194        159 */         add     %i3,1,%o3
/* 0x0198        158 */         ldd     [%i1+%lo(___const_seg_900000501)],%f20
/* 0x019c        159 */         sra     %o3,0,%l3
/* 0x01a0            */         add     %i5,1,%i5
/* 0x01a4        158 */         ld      [%sp+2399],%f17
! %o0 = byte offset of d16[2*i + 1]
/* 0x01a8        159 */         sllx    %l3,3,%o0
/* 0x01ac            */         add     %l7,4,%l7
/* 0x01b0            */         fmovs   %f16,%f18
/* 0x01b4            */         cmp     %i5,%l6
/* 0x01b8            */         add     %i3,2,%i3
/* 0x01bc        158 */         fsubd   %f16,%f20,%f48
/* 0x01c0            */         std     %f48,[%i2+%o1]
/* 0x01c4        159 */         ld      [%sp+2367],%f19
/* 0x01c8            */         fsubd   %f18,%f20,%f50
/* 0x01cc            */         std     %f50,[%i2+%o0]
! Loop while i <= len - 1.  The branch is annulled, so the delay-slot
! load of the next word runs only when the branch is taken.
/* 0x01d0            */         ble,a,pt        %icc,.L900000510
/* 0x01d4        157 */         ld      [%l7],%l2
                       .L77000272:
/* 0x01d8        159 */         ret     ! Result =
/* 0x01dc            */         restore %g0,%g0,%g0
/* 0x01e0          0 */         .type   conv_i32_to_d16,2
/* 0x01e0          0 */         .size   conv_i32_to_d16,(.-conv_i32_to_d16)

        .section        ".text",#alloc,#execinstr
/* 000000          0 */         .align  8
!
! CONSTANT POOL
!
/*
 * Integer-to-double conversion constant used by the one-word-at-a-time
 * paths of conv_i32_to_d32_and_d16.
 *
 * 1127219200 is 0x43300000, the high word of the IEEE-754 double 2^52.
 *   +0  the double 2^52 (high word 0x43300000, low word 0).  It is loaded
 *       with ldd and used as the second operand of fsubd.
 *   +8  the same high word on its own.  It is loaded with ld into the even
 *       half of an FP register pair; the odd half receives a 32-bit
 *       unsigned integer n, so the pair reads as the double 2^52 + n and
 *       the fsubd against the constant at +0 leaves (double)n.
 */
                       ___const_seg_900000601:
/* 000000          0 */         .word   1127219200,0
/* 0x0008            */         .word   1127219200
/* 0x000c          0 */         .type   ___const_seg_900000601,1
/* 0x000c          0 */         .size   ___const_seg_900000601,(.-___const_seg_900000601)
/* 0x000c          0 */         .align  8
/* Reserve 24 bytes, then align to 32 for whatever follows in this section. */
/* 0x0010            */         .skip   24
/* 0x0028            */         .align  32

!  160                ! }
!  161                !}
!  163                !#ifdef RF_INLINE_MACROS
!  165                !void
!  166                !i16_to_d16_and_d32x4(const double *,     /* 1/(2^16) */
!  167                !                 const double *, /* 2^16 */
!  168                !                 const double *, /* 0 */
!  169                !                 double *,       /* result16 */
!  170                !                 double *,       /* result32 */
!  171                !                 float *);       /* source - should be unsigned int* */
!  172                !                                 /* converted to float* */
!  174                !#else
!  177                !/* ARGSUSED */
!  178                !static void
!  179                !i16_to_d16_and_d32x4(const double *dummy1,       /* 1/(2^16) */
!  180                !                 const double *dummy2,   /* 2^16 */
!  181                !                 const double *dummy3,   /* 0 */
!  182                !                 double *result16,
!  183                !                 double *result32,
!  184                !                 float *src)     /* source - should be unsigned int* */
!  185                !                                 /* converted to float* */
!  186                !{
!  187                ! uint32_t *i32;
!  188                ! uint32_t a, b, c, d;
!  190                ! i32 = (uint32_t *)src;
!  191                ! a = i32[0];
!  192                ! b = i32[1];
!  193                ! c = i32[2];
!  194                ! d = i32[3];
!  195                ! result16[0] = (double)(a & 0xffff);
!  196                ! result16[1] = (double)(a >> 16);
!  197                ! result32[0] = (double)a;
!  198                ! result16[2] = (double)(b & 0xffff);
!  199                ! result16[3] = (double)(b >> 16);
!  200                ! result32[1] = (double)b;
!  201                ! result16[4] = (double)(c & 0xffff);
!  202                ! result16[5] = (double)(c >> 16);
!  203                ! result32[2] = (double)c;
!  204                ! result16[6] = (double)(d & 0xffff);
!  205                ! result16[7] = (double)(d >> 16);
!  206                ! result32[3] = (double)d;
!  207                !}
!  209                !#endif
!  212                !void
!  213                !conv_i32_to_d32_and_d16(double *d32, double *d16, uint32_t *i32, int len)
!  214                !{

!
! SUBROUTINE conv_i32_to_d32_and_d16
!
! OFFSET    SOURCE LINE LABEL   INSTRUCTION

/*
 * void conv_i32_to_d32_and_d16(double *d32, double *d16, uint32_t *i32,
 *     int len)
 *
 * For each of the len 32-bit words a = i32[i] this routine stores
 *      d32[i]       = (double)a
 *      d16[2*i]     = (double)(a & 0xffff)
 *      d16[2*i + 1] = (double)(a >> 16)
 *
 * Arguments after the register-window save:
 *      %i0 = d32, %i1 = d16, %i2 = i32, %i3 = len
 *
 * Layout of the code:
 *   1. .L900000615  main loop, four words per iteration (the inlined
 *      i16_to_d16_and_d32x4), repeated while i <= len - 4.
 *   2. .L77000291 / .L900000614  one word per iteration for the elements
 *      the main loop did not cover.
 *   3. .L900000612 .. .L900000613  an unrolled, prefetching variant of (2)
 *      that is selected only when at least 5 elements remain.
 *
 * Paths (2) and (3) pass integer values to the FP registers through
 * scratch slots in this frame, [%sp+2239] .. [%sp+2399].
 * NOTE(review): these offsets presumably include the 2047-byte V9 stack
 * bias, which would place them at 192..352 inside the 368-byte frame.
 */
                        .global conv_i32_to_d32_and_d16
                       conv_i32_to_d32_and_d16:
/* 000000        214 */         save    %sp,-368,%sp

!  215                ! int i;
!  216                ! uint32_t a;
!  218                !#pragma pipeloop(0)
!  219                ! for (i = 0; i < len - 3; i += 4) {
!  220                !         i16_to_d16_and_d32x4(&TwoToMinus16, &TwoTo16, &Zero,
!  221                !                                 &(d16[2*i]), &(d32[i]),
!  222                !                                 (float *)(&(i32[i])));
!  223                ! }
!  224                ! for (; i < len; i++) {
!  225                !         a = i32[i];
!  226                !         d32[i] = (double)(i32[i]);
!  227                !         d16[2 * i] = (double)(a & 0xffff);
!  228                !         d16[2 * i + 1] = (double)(a >> 16);

/*
 * %i4 = len - 3.  If it is <= 0 the four-word loop is skipped; the delay
 * slot sets i (%i5) = 0 on both paths.
 */
/* 0x0004        228 */         sub     %i3,3,%i4
/* 0x0008        219 */         cmp     %i4,0
/* 0x000c            */         ble,pn  %icc,.L77000289
/* 0x0010            */         or      %g0,0,%i5
/*
 * Main-loop setup:
 *   %g3 / %g2 / %o5 = %hi() of Zero / TwoToMinus16 / TwoTo16
 *   %f2 = Zero, %o4 = len - 4 (loop bound), %o3 = d16 element index (2*i),
 *   %l6 = d32 store cursor, %l5 = i32 load cursor.
 */
                       .L77000306:
/* 0x0014        222 */         sethi   %hi(Zero),%g3
/* 0x0018            */         sethi   %hi(TwoToMinus16),%g2
/* 0x001c            */         sethi   %hi(TwoTo16),%o5
/* 0x0020            */         ldd     [%g3+%lo(Zero)],%f2
/* 0x0024        219 */         sub     %i3,4,%o4
/* 0x0028            */         or      %g0,0,%o3
/* 0x002c            */         or      %g0,%i0,%l6
/* 0x0030            */         or      %g0,%i2,%l5
/*
 * Four words a, b, c, d per iteration.
 *
 * Each of the pairs %f26, %f28, %f30, %f2 starts as a copy of Zero and then
 * has its odd (low-order) half overwritten with one source word, so the
 * pair holds that word as a 64-bit integer (assuming Zero is all-zero
 * bits).  fxtod turns it into (double)word, which is stored to d32.
 *
 * The high half is then derived as word * TwoToMinus16, fdtox, fxtod
 * (NOTE(review): relies on fdtox truncating toward zero and on
 * TwoToMinus16 holding 2^-16, which is defined outside this view) and
 * stored to the odd d16 slots.  The low half is word - high * TwoTo16 and
 * is stored to the even d16 slots.
 *
 * %g4 = &d16[2*i] for this iteration (%i1 + %g5, %g5 = byte offset).
 */
                       .L900000615:
/* 0x0034        222 */         fmovd   %f2,%f26
/* 0x0038            */         ld      [%l5],%f27
/* 0x003c            */         sra     %o3,0,%o0
/* 0x0040            */         add     %i5,4,%i5
/* 0x0044            */         fmovd   %f2,%f28
/* 0x0048            */         ld      [%l5+4],%f29
/* 0x004c            */         sllx    %o0,3,%g5
/* 0x0050            */         cmp     %i5,%o4
/* 0x0054            */         fmovd   %f2,%f30
/* 0x0058            */         ld      [%l5+8],%f31
/* 0x005c            */         add     %i1,%g5,%g4
/* 0x0060            */         add     %o3,8,%o3
/* 0x0064            */         ld      [%l5+12],%f3
/* 0x0068            */         fxtod   %f26,%f26
/* 0x006c            */         ldd     [%g2+%lo(TwoToMinus16)],%f32
/* 0x0070            */         fxtod   %f28,%f28
/* 0x0074            */         add     %l5,16,%l5
/* 0x0078            */         fxtod   %f30,%f30
/* 0x007c            */         ldd     [%o5+%lo(TwoTo16)],%f34
/* 0x0080            */         fxtod   %f2,%f2
/* d32[i .. i+3] = (double) of the four words */
/* 0x0084            */         std     %f2,[%l6+24]
/* 0x0088            */         fmuld   %f32,%f26,%f36
/* 0x008c            */         std     %f26,[%l6]
/* 0x0090            */         fmuld   %f32,%f28,%f38
/* 0x0094            */         std     %f28,[%l6+8]
/* 0x0098            */         fmuld   %f32,%f30,%f40
/* 0x009c            */         std     %f30,[%l6+16]
/* 0x00a0            */         fmuld   %f32,%f2,%f42
/* 0x00a4            */         add     %l6,32,%l6
/* 0x00a8            */         fdtox   %f36,%f36
/* 0x00ac            */         fdtox   %f38,%f38
/* 0x00b0            */         fdtox   %f40,%f40
/* 0x00b4            */         fdtox   %f42,%f42
/* high halves go to the odd d16 slots: byte offsets +8, +24, +40, +56 */
/* 0x00b8            */         fxtod   %f36,%f36
/* 0x00bc            */         std     %f36,[%g4+8]
/* 0x00c0            */         fxtod   %f38,%f38
/* 0x00c4            */         std     %f38,[%g4+24]
/* 0x00c8            */         fxtod   %f40,%f40
/* 0x00cc            */         std     %f40,[%g4+40]
/* 0x00d0            */         fxtod   %f42,%f42
/* 0x00d4            */         std     %f42,[%g4+56]
/* low halves = word - high * TwoTo16, to the even d16 slots: +0, +16, +32, +48 */
/* 0x00d8            */         fmuld   %f36,%f34,%f36
/* 0x00dc            */         fmuld   %f38,%f34,%f38
/* 0x00e0            */         fmuld   %f40,%f34,%f40
/* 0x00e4            */         fmuld   %f42,%f34,%f42
/* 0x00e8            */         fsubd   %f26,%f36,%f36
/* 0x00ec            */         std     %f36,[%i1+%g5]
/* 0x00f0            */         fsubd   %f28,%f38,%f38
/* 0x00f4            */         std     %f38,[%g4+16]
/* 0x00f8            */         fsubd   %f30,%f40,%f40
/* 0x00fc            */         std     %f40,[%g4+32]
/* 0x0100            */         fsubd   %f2,%f42,%f42
/* 0x0104            */         std     %f42,[%g4+48]
/*
 * Repeat while i <= len - 4.  The annulled delay slot reloads Zero into
 * %f2 (overwritten above) only when the branch is taken.
 */
/* 0x0108            */         ble,a,pt        %icc,.L900000615
/* 0x010c            */         ldd     [%g3+%lo(Zero)],%f2
/*
 * Remainder: return if i >= len.  The delay slot starts building the
 * 0xffff mask in %l0 (0xfc00 here, + 1023 below).
 */
                       .L77000289:
/* 0x0110        224 */         cmp     %i5,%i3
/* 0x0114            */         bge,pn  %icc,.L77000294
/* 0x0118            */         sethi   %hi(0xfc00),%l0
/*
 * Remainder setup:
 *   %i4 = 2*i (d16 element index), %l1 = &d32[i], %i2 = &i32[i],
 *   %l0 = 0xffff, %l7 = %hi(___const_seg_900000601), %l3 = len - i.
 * If fewer than 5 elements remain, use the simple loop at .L77000291.
 *
 * NOTE(review): every path reaching this test seems to have len - i <= 3
 * (the main loop exits only once i > len - 4 and is skipped only when
 * len <= 3), so the branch below looks always taken and the unrolled block
 * .L900000612 .. .L900000613 looks unreachable.  Confirm before relying on
 * this.
 */
                       .L77000307:
/* 0x011c        224 */         sra     %i5,0,%l2
/* 0x0120            */         sll     %i5,1,%i4
/* 0x0124            */         sllx    %l2,3,%l1
/* 0x0128            */         sllx    %l2,2,%o1
/* 0x012c        225 */         sub     %i3,%i5,%l3
/* 0x0130        224 */         add     %l0,1023,%l0
/* 0x0134            */         add     %l1,%i0,%l1
/* 0x0138            */         add     %o1,%i2,%i2
/* 0x013c        225 */         cmp     %l3,5
/* 0x0140            */         bl,pn   %icc,.L77000291
/* 0x0144          0 */         sethi   %hi(___const_seg_900000601),%l7
/*
 * Unrolled remainder, prologue.  Issues prefetches on the d32 and d16
 * output areas, saves len in %g1 (because %i3 is reused as a scratch
 * register inside the loop), sets %i0 = len - 3 as the loop bound and
 * %l3 = d16 store cursor, and stages the 16-bit halves of the first word
 * in the stack slots [%sp+2239] (low) and [%sp+2271] (high).
 * %f32 = 2^52 and %f0 = its high word, for the 2^52 + n conversion.
 */
                       .L900000612:
/* 0x0148        225 */         prefetch        [%l1],22
/* 0x014c            */         prefetch        [%l1+64],22
/* 0x0150            */         sra     %i4,0,%l6
/* 0x0154        226 */         sethi   %hi(___const_seg_900000601+8),%l2
/* 0x0158        225 */         prefetch        [%l1+128],22
/* 0x015c            */         add     %l6,-2,%l5
/* 0x0160            */         sub     %i3,3,%i0
/* 0x0164            */         prefetch        [%l1+192],22
/* 0x0168            */         sllx    %l5,3,%o4
/* 0x016c        228 */         add     %i5,1,%i5
/* 0x0170        225 */         add     %i1,%o4,%o3
/* 0x0174            */         or      %g0,%i3,%g1
/* 0x0178            */         ld      [%i2],%l4
/* 0x017c            */         prefetch        [%o3+16],22
/* 0x0180            */         add     %o3,16,%l3
/* 0x0184        228 */         add     %i2,4,%i2
/* 0x0188        225 */         prefetch        [%o3+80],22
/* 0x018c        228 */         srl     %l4,16,%o1
/* 0x0190        227 */         and     %l4,%l0,%o0
/* 0x0194        225 */         prefetch        [%o3+144],22
/* 0x0198        228 */         st      %o1,[%sp+2271]
/* 0x019c        227 */         st      %o0,[%sp+2239]
/* 0x01a0        226 */         ldd     [%l7+%lo(___const_seg_900000601)],%f32
/* 0x01a4        228 */         ld      [%l2+%lo(___const_seg_900000601+8)],%f0
/* 0x01a8        225 */         prefetch        [%o3+208],22
/* 0x01ac            */         prefetch        [%o3+272],22
/* 0x01b0            */         prefetch        [%o3+336],22
/*
 * Unrolled remainder, steady state: i advances by 4 per iteration.  Each
 * step finishes one element (three fsubd/std: d32 through %l1, d16 low and
 * high through %l3) while splitting the next word into its 16-bit halves.
 * The halves alternate between the slot pairs [%sp+2239]/[%sp+2271] and
 * [%sp+2303]/[%sp+2335].
 */
                       .L900000610:
/* 0x01b4        225 */         prefetch        [%l1+192],22
/* 0x01b8        228 */         add     %i5,4,%i5
/* 0x01bc        225 */         add     %l3,64,%l3
/* 0x01c0        227 */         ld      [%l2+%lo(___const_seg_900000601+8)],%f8
/* 0x01c4        228 */         cmp     %i5,%i0
/* 0x01c8        225 */         ld      [%i2],%g5
/* 0x01cc        228 */         add     %i2,16,%i2
/* 0x01d0            */         add     %l1,32,%l1
/* 0x01d4            */         add     %i4,8,%i4
/* 0x01d8        226 */         ld      [%i2-20],%f7
/* 0x01dc        228 */         srl     %g5,16,%i3
/* 0x01e0        226 */         fmovs   %f8,%f6
/* 0x01e4        228 */         st      %i3,[%sp+2335]
/* 0x01e8        227 */         and     %g5,%l0,%g4
/* 0x01ec            */         st      %g4,[%sp+2303]
/* 0x01f0        226 */         fsubd   %f6,%f32,%f40
/* 0x01f4        227 */         ld      [%sp+2239],%f9
/* 0x01f8        228 */         ld      [%sp+2271],%f1
/* 0x01fc            */         fmovs   %f8,%f12
/* 0x0200        226 */         std     %f40,[%l1-32]
/* 0x0204        227 */         fsubd   %f8,%f32,%f42
/* 0x0208            */         std     %f42,[%l3-64]
/* 0x020c        228 */         fsubd   %f0,%f32,%f44
/* 0x0210            */         std     %f44,[%l3-56]
/* 0x0214        227 */         fmovs   %f12,%f10
/* 0x0218        225 */         ld      [%i2-12],%g2
/* 0x021c        226 */         ld      [%i2-16],%f1
/* 0x0220        228 */         srl     %g2,16,%g3
/* 0x0224        226 */         fmovs   %f12,%f0
/* 0x0228        225 */         prefetch        [%l3+320],22
/* 0x022c        228 */         st      %g3,[%sp+2271]
/* 0x0230        227 */         and     %g2,%l0,%l6
/* 0x0234            */         st      %l6,[%sp+2239]
/* 0x0238        226 */         fsubd   %f0,%f32,%f46
/* 0x023c        227 */         ld      [%sp+2303],%f11
/* 0x0240        228 */         ld      [%sp+2335],%f13
/* 0x0244            */         fmovs   %f12,%f18
/* 0x0248        226 */         std     %f46,[%l1-24]
/* 0x024c        227 */         fsubd   %f10,%f32,%f48
/* 0x0250            */         std     %f48,[%l3-48]
/* 0x0254        228 */         fsubd   %f12,%f32,%f50
/* 0x0258            */         std     %f50,[%l3-40]
/* 0x025c        227 */         fmovs   %f18,%f16
/* 0x0260        225 */         ld      [%i2-8],%o5
/* 0x0264        226 */         ld      [%i2-12],%f15
/* 0x0268        228 */         srl     %o5,16,%l5
/* 0x026c        226 */         fmovs   %f18,%f14
/* 0x0270        228 */         st      %l5,[%sp+2335]
/* 0x0274        227 */         and     %o5,%l0,%o4
/* 0x0278            */         st      %o4,[%sp+2303]
/* 0x027c        226 */         fsubd   %f14,%f32,%f52
/* 0x0280        227 */         ld      [%sp+2239],%f17
/* 0x0284        228 */         ld      [%sp+2271],%f19
/* 0x0288        225 */         prefetch        [%l3+352],22
/* 0x028c        228 */         fmovs   %f18,%f24
/* 0x0290        226 */         std     %f52,[%l1-16]
/* 0x0294        227 */         fsubd   %f16,%f32,%f54
/* 0x0298            */         std     %f54,[%l3-32]
/* 0x029c        228 */         fsubd   %f18,%f32,%f56
/* 0x02a0            */         std     %f56,[%l3-24]
/* 0x02a4        227 */         fmovs   %f24,%f22
/* 0x02a8        225 */         ld      [%i2-4],%l4
/* 0x02ac        226 */         ld      [%i2-8],%f21
/* 0x02b0        228 */         srl     %l4,16,%o3
/* 0x02b4        226 */         fmovs   %f24,%f20
/* 0x02b8        228 */         st      %o3,[%sp+2271]
/* 0x02bc        227 */         and     %l4,%l0,%o2
/* 0x02c0            */         st      %o2,[%sp+2239]
/* 0x02c4        226 */         fsubd   %f20,%f32,%f58
/* 0x02c8        227 */         ld      [%sp+2303],%f23
/* 0x02cc        228 */         ld      [%sp+2335],%f25
/* 0x02d0            */         fmovs   %f24,%f0
/* 0x02d4        226 */         std     %f58,[%l1-8]
/* 0x02d8        227 */         fsubd   %f22,%f32,%f60
/* 0x02dc            */         std     %f60,[%l3-16]
/* 0x02e0        228 */         fsubd   %f24,%f32,%f62
/* 0x02e4            */         bl,pt   %icc,.L900000610
/* 0x02e8            */         std     %f62,[%l3-8]
/*
 * Unrolled remainder, drain.  Finishes the element whose halves were
 * staged by the last loop step, restores len into %i3 from %g1, and
 * returns if i >= len; otherwise falls into the simple loop.
 */
                       .L900000613:
/* 0x02ec        227 */         ld      [%l2+%lo(___const_seg_900000601+8)],%f4
/* 0x02f0        228 */         add     %l1,8,%l1
/* 0x02f4            */         cmp     %i5,%g1
/* 0x02f8        226 */         ld      [%i2-4],%f3
/* 0x02fc        225 */         or      %g0,%g1,%i3
/* 0x0300        228 */         add     %i4,2,%i4
/* 0x0304        227 */         ld      [%sp+2239],%f5
/* 0x0308        226 */         fmovs   %f4,%f2
/* 0x030c        228 */         ld      [%sp+2271],%f1
/* 0x0310        226 */         fsubd   %f2,%f32,%f34
/* 0x0314            */         std     %f34,[%l1-8]
/* 0x0318        227 */         fsubd   %f4,%f32,%f36
/* 0x031c            */         std     %f36,[%l3]
/* 0x0320        228 */         fsubd   %f0,%f32,%f38
/* 0x0324            */         bge,pn  %icc,.L77000294
/* 0x0328            */         std     %f38,[%l3+8]
/*
 * Simple remainder loop, one word per iteration.
 *   %o2 = a = i32[i]; a >> 16 goes to [%sp+2367], a & 0xffff to [%sp+2399].
 *   %f32 = 2^52; %f4, %f6, %f8 = high word of 2^52.
 *   %f5 = a, %f7 = low half, %f9 = high half (odd register of each pair).
 *   fsubd of each pair against %f32 gives the double value, stored to
 *   d32[i] ([%l1]), d16[2*i] ([%i1+%o0]) and d16[2*i+1] ([%i1+%g5]).
 * Loops while ++i < len; the annulled delay slot preloads the next word.
 */
                       .L77000291:
/* 0x032c        225 */         ld      [%i2],%o2
                       .L900000614:
/* 0x0330        226 */         ldd     [%l7+%lo(___const_seg_900000601)],%f32
/* 0x0334        228 */         srl     %o2,16,%l3
/* 0x0338        227 */         sra     %i4,0,%i0
/* 0x033c        228 */         st      %l3,[%sp+2367]
/* 0x0340        227 */         and     %o2,%l0,%g1
/* 0x0344        226 */         sethi   %hi(___const_seg_900000601+8),%l2
/* 0x0348        227 */         st      %g1,[%sp+2399]
/* 0x034c            */         sllx    %i0,3,%o0
/* 0x0350        228 */         add     %i4,1,%l4
/* 0x0354        226 */         ld      [%l2+%lo(___const_seg_900000601+8)],%f4
/* 0x0358        228 */         sra     %l4,0,%o1
/* 0x035c            */         add     %i5,1,%i5
/* 0x0360        226 */         ld      [%i2],%f5
/* 0x0364        228 */         sllx    %o1,3,%g5
/* 0x0368            */         cmp     %i5,%i3
/* 0x036c            */         ld      [%sp+2367],%f9
/* 0x0370            */         add     %i2,4,%i2
/* 0x0374            */         add     %i4,2,%i4
/* 0x0378        227 */         fmovs   %f4,%f6
/* 0x037c        226 */         fsubd   %f4,%f32,%f44
/* 0x0380            */         std     %f44,[%l1]
/* 0x0384        227 */         ld      [%sp+2399],%f7
/* 0x0388        228 */         fmovs   %f6,%f8
/* 0x038c            */         add     %l1,8,%l1
/* 0x0390            */         fsubd   %f8,%f32,%f48
/* 0x0394        227 */         fsubd   %f6,%f32,%f46
/* 0x0398            */         std     %f46,[%i1+%o0]
/* 0x039c        228 */         std     %f48,[%i1+%g5]
/* 0x03a0            */         bl,a,pt %icc,.L900000614
/* 0x03a4        225 */         ld      [%i2],%o2
/* Common exit: pop the register window and return. */
                       .L77000294:
/* 0x03a8        222 */         ret     ! Result =
/* 0x03ac            */         restore %g0,%g0,%g0
/* 0x03b0          0 */         .type   conv_i32_to_d32_and_d16,2
/* 0x03b0          0 */         .size   conv_i32_to_d32_and_d16,(.-conv_i32_to_d32_and_d16)

        .section        ".text",#alloc,#execinstr
/* 000000          0 */         .align  32

!  229                ! }
!  230                !}
!  232                !extern long long c1, c2, c3, c4;
!  234                !static void
!  235                !adjust_montf_result(uint32_t *i32, uint32_t *nint, int len)
!  236                !{

!
! SUBROUTINE adjust_montf_result
!
! OFFSET    SOURCE LINE LABEL   INSTRUCTION

/*
 * static void adjust_montf_result(uint32_t *i32, uint32_t *nint, int len)
 *
 * Leaf routine: no register window is allocated and it returns with retl.
 * On entry %o0 = i32, %o1 = nint, %o2 = len.
 *
 * Subtracts nint[0 .. len-1] from i32[0 .. len-1] in place, carrying the
 * borrow between words in a 64-bit accumulator, when either
 *   - i32[len] is non-zero, or
 *   - comparing from index len-1 downwards, all words of i32 and nint are
 *     equal, or the first differing word has i32[i] > nint[i] (unsigned).
 * In every other case i32 is left untouched.
 *
 * Register roles set up before the subtraction:
 *   %g2 = len (sign-extended), %g3 = len - 1, %o4 = i32, %o5 = nint.
 */
                       adjust_montf_result:
/* 000000        236 */         sra     %o2,0,%g2
/* 0x0004            */         or      %g0,%o0,%o4

!  237                ! int64_t acc;
!  238                ! int i;
!  240                ! if (i32[len] > 0) {

/*
 * %o0 = i32[len].  bleu after a compare with 0 is taken only when the word
 * is zero; the delay slot copies nint into %o5 on both paths.
 */
/* 0x0008        240 */         sllx    %g2,2,%g3
/* 0x000c            */         ld      [%o0+%g3],%o0
/* 0x0010            */         cmp     %o0,0
/* 0x0014            */         bleu,pn %icc,.L77000316
/* 0x0018        236 */         or      %g0,%o1,%o5

!  241                !         i = -1;

/* i32[len] != 0: go straight to the subtraction, with %g3 = len - 1. */
                       .L77000315:
/* 0x001c        241 */         sub     %g2,1,%g3
/* 0x0020            */         ba      .L900000712
/* 0x0024        249 */         cmp     %g2,0

!  242                ! } else {
!  243                !         for (i = len - 1; i >= 0; i--) {

/*
 * i32[len] == 0: %g3 = %o3 = len - 1.  If that is negative there is nothing
 * to compare, so branch to the (len <= 0) check at .L77000340.
 */
                       .L77000316:
/* 0x0028        243 */         subcc   %g2,1,%g3
/* 0x002c            */         bneg,pn %icc,.L77000340
/* 0x0030            */         or      %g0,%g3,%o3
/*
 * Downward comparison.  %o2 = &i32[i], %o1 = &nint[i], %o3 = i,
 * %g4 = nint[i] (preloaded).  Leaves the loop at the first mismatch.
 */
                       .L77000348:
/* 0x0034        243 */         sra     %g3,0,%o1
/* 0x0038            */         sllx    %o1,2,%g1

!  244                !                 if (i32[i] != nint[i]) break;

/* 0x003c        244 */         ld      [%g1+%o5],%g4
/* 0x0040        243 */         add     %g1,%o4,%o2
/* 0x0044            */         add     %g1,%o5,%o1
                       .L900000713:
/* 0x0048        244 */         ld      [%o2],%o0
/* 0x004c            */         cmp     %o0,%g4
/* 0x0050            */         bne,pn  %icc,.L77000324
/* 0x0054            */         sub     %o2,4,%o2
                       .L77000320:
/* 0x0058        244 */         sub     %o1,4,%o1
/* 0x005c            */         subcc   %o3,1,%o3
/* 0x0060            */         bpos,a,pt       %icc,.L900000713
/* 0x0064            */         ld      [%o1],%g4
/* Every word matched (i ran below 0): do the subtraction. */
                       .L900000706:
/* 0x0068        244 */         ba      .L900000712
/* 0x006c        249 */         cmp     %g2,0
/*
 * Mismatch at index %o3: reload both words and return without changes
 * when i32[i] <= nint[i] (unsigned); otherwise fall into the subtraction.
 */
                       .L77000324:
/* 0x0070        244 */         sra     %o3,0,%o0
/* 0x0074            */         sllx    %o0,2,%g1
/* 0x0078            */         ld      [%o5+%g1],%o3
/* 0x007c            */         ld      [%o4+%g1],%g5
/* 0x0080            */         cmp     %g5,%o3
/* 0x0084            */         bleu,pt %icc,.L77000332
/* 0x0088            */         nop

!  245                !         }
!  246                ! }
!  247                ! if ((i < 0) || (i32[i] > nint[i])) {
!  248                !         acc = 0;
!  249                !         for (i = 0; i < len; i++) {

/* Subtraction entry: nothing to do when len <= 0.  %o3 = len. */
                       .L77000340:
/* 0x008c        249 */         cmp     %g2,0
                       .L900000712:
/* 0x0090        249 */         ble,pn  %icc,.L77000332
/* 0x0094        250 */         or      %g0,%g2,%o3
/*
 * %o0 = i = 0.  With fewer than 10 words use the simple loop at
 * .L77000341; the delay slot clears the accumulator %g2.
 */
                       .L77000347:
/* 0x0098        249 */         or      %g0,0,%o0

!  250                !                 acc = acc + (uint64_t)(i32[i]) - (uint64_t)(nint[i]);

/* 0x009c        250 */         cmp     %o3,10
/* 0x00a0            */         bl,pn   %icc,.L77000341
/* 0x00a4        249 */         or      %g0,0,%g2
/*
 * Unrolled path, prologue.  Prefetches i32, then handles word 0:
 * i32[0] = i32[0] - nint[0], with the shifted-down result (acc >> 32) kept
 * in %g4 as the borrow.  i32[1] is preloaded into %g5.
 * On exit %o1 = &nint[1], %o2 = &i32[2], %o0 = 2, %o5 = len - 8 (loop
 * bound).  Note that %o4 and %o5 no longer hold the array bases here.
 */
                       .L900000709:
/* 0x00a8        250 */         prefetch        [%o4],22
/* 0x00ac            */         prefetch        [%o4+64],22

!  251                !                 i32[i] = acc & 0xffffffff;
!  252                !                 acc = acc >> 32;

/* 0x00b0        252 */         add     %o5,4,%o1
/* 0x00b4            */         add     %o4,8,%o2
/* 0x00b8        250 */         prefetch        [%o4+128],22
/* 0x00bc            */         sub     %o3,8,%o5
/* 0x00c0            */         or      %g0,2,%o0
/* 0x00c4            */         prefetch        [%o4+192],22
/* 0x00c8            */         prefetch        [%o4+256],22
/* 0x00cc            */         prefetch        [%o4+320],22
/* 0x00d0            */         prefetch        [%o4+384],22
/* 0x00d4            */         ld      [%o2-4],%g5
/* 0x00d8            */         prefetch        [%o2+440],22
/* 0x00dc            */         prefetch        [%o2+504],22
/* 0x00e0            */         ld      [%o4],%g4
/* 0x00e4            */         ld      [%o1-4],%o4
/* 0x00e8            */         sub     %g4,%o4,%o3
/* 0x00ec        251 */         st      %o3,[%o2-8]
/* 0x00f0        252 */         srax    %o3,32,%g4
/*
 * Unrolled path, steady state: eight words per iteration.  Each step is
 *   load nint word; diff = i32 word - nint word + borrow;
 *   store low 32 bits of diff; borrow = diff >> 32 (arithmetic).
 * The i32 word for the following step is loaded one step ahead.  Loops
 * while %o0 <= len - 8.
 */
                       .L900000707:
/* 0x00f4        252 */         add     %o0,8,%o0
/* 0x00f8            */         add     %o2,32,%o2
/* 0x00fc        250 */         ld      [%o1],%g1
/* 0x0100            */         prefetch        [%o2+496],22
/* 0x0104        252 */         cmp     %o0,%o5
/* 0x0108            */         add     %o1,32,%o1
/* 0x010c        250 */         sub     %g5,%g1,%g5
/* 0x0110            */         add     %g5,%g4,%o4
/* 0x0114            */         ld      [%o2-32],%g4
/* 0x0118        251 */         st      %o4,[%o2-36]
/* 0x011c        252 */         srax    %o4,32,%g1
/* 0x0120        250 */         ld      [%o1-28],%o3
/* 0x0124            */         sub     %g4,%o3,%g2
/* 0x0128            */         add     %g2,%g1,%g5
/* 0x012c            */         ld      [%o2-28],%o3
/* 0x0130        251 */         st      %g5,[%o2-32]
/* 0x0134        252 */         srax    %g5,32,%g4
/* 0x0138        250 */         ld      [%o1-24],%o4
/* 0x013c            */         sub     %o3,%o4,%g1
/* 0x0140            */         add     %g1,%g4,%g2
/* 0x0144            */         ld      [%o2-24],%o3
/* 0x0148        251 */         st      %g2,[%o2-28]
/* 0x014c        252 */         srax    %g2,32,%g5
/* 0x0150        250 */         ld      [%o1-20],%o4
/* 0x0154            */         sub     %o3,%o4,%g4
/* 0x0158            */         add     %g4,%g5,%g1
/* 0x015c            */         ld      [%o2-20],%o4
/* 0x0160        251 */         st      %g1,[%o2-24]
/* 0x0164        252 */         srax    %g1,32,%o3
/* 0x0168        250 */         ld      [%o1-16],%g2
/* 0x016c            */         sub     %o4,%g2,%g5
/* 0x0170            */         add     %g5,%o3,%g1
/* 0x0174            */         ld      [%o2-16],%g4
/* 0x0178        251 */         st      %g1,[%o2-20]
/* 0x017c        252 */         srax    %g1,32,%o4
/* 0x0180        250 */         ld      [%o1-12],%g2
/* 0x0184            */         sub     %g4,%g2,%o3
/* 0x0188            */         add     %o3,%o4,%g5
/* 0x018c            */         ld      [%o2-12],%g2
/* 0x0190        251 */         st      %g5,[%o2-16]
/* 0x0194        252 */         srax    %g5,32,%g4
/* 0x0198        250 */         ld      [%o1-8],%g1
/* 0x019c            */         sub     %g2,%g1,%o4
/* 0x01a0            */         add     %o4,%g4,%o3
/* 0x01a4            */         ld      [%o2-8],%g2
/* 0x01a8        251 */         st      %o3,[%o2-12]
/* 0x01ac        252 */         srax    %o3,32,%g5
/* 0x01b0        250 */         ld      [%o1-4],%g1
/* 0x01b4            */         sub     %g2,%g1,%g4
/* 0x01b8            */         add     %g4,%g5,%o4
/* 0x01bc            */         ld      [%o2-4],%g5
/* 0x01c0        251 */         st      %o4,[%o2-8]
/* 0x01c4        252 */         ble,pt  %icc,.L900000707
/* 0x01c8            */         srax    %o4,32,%g4
/*
 * Unrolled path, drain.  Completes the word preloaded in %g5, then
 * re-establishes the simple-loop registers: %o4 = next i32 word address,
 * %o5 = next nint word address, %g2 = borrow.  Returns if %o0 > len - 1.
 */
                       .L900000710:
/* 0x01cc        250 */         ld      [%o1],%o3
/* 0x01d0        252 */         add     %o1,4,%o5
/* 0x01d4        250 */         or      %g0,%o2,%o4
/* 0x01d8        252 */         cmp     %o0,%g3
/* 0x01dc        250 */         sub     %g5,%o3,%g2
/* 0x01e0            */         add     %g2,%g4,%g1
/* 0x01e4        251 */         st      %g1,[%o2-4]
/* 0x01e8        252 */         bg,pn   %icc,.L77000332
/* 0x01ec            */         srax    %g1,32,%g2
/*
 * Simple loop, one word per iteration:
 *   %o1 = acc + *%o4 - *%o5; *%o4 = low 32 bits; acc (%g2) = %o1 >> 32.
 * Loops while ++%o0 <= len - 1 (%g3); the annulled delay slot preloads the
 * next i32 word.
 */
                       .L77000341:
/* 0x01f0        250 */         ld      [%o4],%g5
                       .L900000711:
/* 0x01f4        250 */         ld      [%o5],%o2
/* 0x01f8            */         add     %g2,%g5,%g4
/* 0x01fc        252 */         add     %o0,1,%o0
/* 0x0200            */         cmp     %o0,%g3
/* 0x0204            */         add     %o5,4,%o5
/* 0x0208        250 */         sub     %g4,%o2,%o1
/* 0x020c        251 */         st      %o1,[%o4]
/* 0x0210        252 */         srax    %o1,32,%g2
/* 0x0214            */         add     %o4,4,%o4
/* 0x0218            */         ble,a,pt        %icc,.L900000711
/* 0x021c        250 */         ld      [%o4],%g5
/* Common exit. */
                       .L77000332:
/* 0x0220        252 */         retl    ! Result =
/* 0x0224            */         nop
/* 0x0228          0 */         .type   adjust_montf_result,2
/* 0x0228          0 */         .size   adjust_montf_result,(.-adjust_montf_result)

        .section        ".text",#alloc,#execinstr
/* 000000          0 */         .align  32

!  253                !         }
!  254                ! }
!  255                !}
!  257                !/*************
!  258                !static void
!  259                !adjust_montf_result_bad(uint32_t *i32, uint32_t *nint, int len)
!  260                !{
!  261                ! int64_t acc;
!  262                ! int i;
!  264                ! c4++;
!  265                !
!  266                ! if (i32[len] > 0) {
!  267                !         i = -1;
!  268                !         c1++;
!  269                ! } else {
!  270                !         for (i = len - 1; i >= 0; i++) {
!  271                !                 if (i32[i] != nint[i]) break;
!  272                !                 c2++;
!  273                !         }
!  274                ! }
!  275                ! if ((i < 0) || (i32[i] > nint[i])) {
!  276                !         c3++;
!  277                !         acc = 0;
!  278                !         for (i = 0; i < len; i++) {
!  279                !                 acc = acc + (uint64_t)(i32[i]) - (uint64_t)(nint[i]);
!  280                !                 i32[i] = acc & 0xffffffff;
!  281                !                 acc = acc >> 32;
!  282                !         }
!  283                ! }
!  284                !}
!  285                !uint32_t saveresult[1000];
!  286                !void printarray(char *name, uint32_t *arr, int len)
!  287                !{
!  288                ! int i, j;
!  289                ! uint64_t tmp;
!  291                ! printf("uint64_t %s[%d] =\n{\n",name,(len+1)/2);
!  292                ! for(i=j=0; i<len; i+=2,j+=2){
!  293                !         if(j == 6){
!  294                !                 printf("\n");
!  295                !                 j=0;
!  296                !         }
!  297                !         tmp = (((uint64_t)arr[i])<<32) | ((uint64_t)arr[i+1]);
!  298                !         printf("0x%016llx",tmp);
!  299                !         if((i/2)!=(((len+1)/2)-1))printf(",");
!  300                !         if(j!=4)printf(" ");
!  301                ! }
!  302                ! if(j!=0) printf("\n");
!  303                ! printf("};\n");
!  304                !}
!  305                !**************/
!  308                !/*
!  309                ! * the lengths of the input arrays should be at least the following:
!  310                ! * result[nlen+1], dm1[nlen], dm2[2*nlen+1], dt[4*nlen+2], dn[nlen], nint[nlen]
!  311                ! * all of them should be different from one another
!  312                ! */
!  313                !void mont_mulf_noconv(uint32_t *result,
!  314                !                 double *dm1, double *dm2, double *dt,
!  315                !                 double *dn, uint32_t *nint,
!  316                !                 int nlen, double dn0)
!  317                !{

!
! SUBROUTINE mont_mulf_noconv
!
! OFFSET    SOURCE LINE LABEL   INSTRUCTION

                        .global mont_mulf_noconv
                       mont_mulf_noconv:
/* 000000        317 */         save    %sp,-176,%sp
/* 0x0004            */         ldx     [%fp+2223],%g1
/* 0x0008          0 */         sethi   %hi(Zero),%l5
/* 0x000c        317 */         or      %g0,%i2,%l0

!  318                ! int i, j, jj;
!  319                ! double digit, m2j, a, b;
!  320                ! double *pdm1, *pdm2, *pdn, *pdtj, pdn_0, pdm1_0;
!  322                ! pdm1 = &(dm1[0]);
!  323                ! pdm2 = &(dm2[0]);
!  324                ! pdn = &(dn[0]);
!  325                ! pdm2[2 * nlen] = Zero;

/* 0x0010        325 */         ldd     [%l5+%lo(Zero)],%f0
/* 0x0014        317 */         or      %g0,%i0,%i2
/* 0x0018        325 */         sll     %g1,1,%o3

!  327                ! if (nlen != 16) {

/* 0x001c        327 */         cmp     %g1,16
/* 0x0020        325 */         sra     %o3,0,%i0
/* 0x0024            */         sllx    %i0,3,%o0
/* 0x0028        317 */         or      %g0,%i5,%i0
/* 0x002c        327 */         bne,pn  %icc,.L77000476
/* 0x0030        325 */         std     %f0,[%l0+%o0]
                       .L77000488:
/* 0x0034          0 */         sethi   %hi(TwoToMinus16),%o2
/* 0x0038          0 */         sethi   %hi(TwoTo16),%l3

!  328                !         for (i = 0; i < 4 * nlen + 2; i++)
!  329                !                 dt[i] = Zero;
!  330                !         a = dt[0] = pdm1[0] * pdm2[0];
!  331                !         digit = mod(lower32(a, Zero) * dn0, TwoToMinus16, TwoTo16);
!  333                !         pdtj = &(dt[0]);
!  334                !         for (j = jj = 0; j < 2 * nlen; j++, jj++, pdtj++) {
!  335                !                 m2j = pdm2[j];
!  336                !                 a = pdtj[0] + pdn[0] * digit;
!  337                !                 b = pdtj[1] + pdm1[0] * pdm2[j + 1] + a * TwoToMinus16;
!  338                !                 pdtj[1] = b;
!  340                !#pragma pipeloop(0)
!  341                !                 for (i = 1; i < nlen; i++) {
!  342                !                         pdtj[2 * i] += pdm1[i] * m2j + pdn[i] * digit;
!  343                !                 }
!  344                !                 if (jj == 15) {
!  345                !                         cleanup(dt, j / 2 + 1, 2 * nlen + 1);
!  346                !                         jj = 0;
!  347                !                 }
!  349                !                 digit = mod(lower32(b, Zero) * dn0,
!  350                !                             TwoToMinus16, TwoTo16);
!  351                !         }
!  352                ! } else {
!  353                !         a = dt[0] = pdm1[0] * pdm2[0];

/* 0x003c        353 */         ldd     [%i1],%f40

!  355                !         dt[65] = dt[64] = dt[63] = dt[62] = dt[61] = dt[60] =
!  356                !                 dt[59] = dt[58] = dt[57] = dt[56] = dt[55] =
!  357                !                 dt[54] = dt[53] = dt[52] = dt[51] = dt[50] =
!  358                !                 dt[49] = dt[48] = dt[47] = dt[46] = dt[45] =
!  359                !                 dt[44] = dt[43] = dt[42] = dt[41] = dt[40] =
!  360                !                 dt[39] = dt[38] = dt[37] = dt[36] = dt[35] =
!  361                !                 dt[34] = dt[33] = dt[32] = dt[31] = dt[30] =
!  362                !                 dt[29] = dt[28] = dt[27] = dt[26] = dt[25] =
!  363                !                 dt[24] = dt[23] = dt[22] = dt[21] = dt[20] =
!  364                !                 dt[19] = dt[18] = dt[17] = dt[16] = dt[15] =
!  365                !                 dt[14] = dt[13] = dt[12] = dt[11] = dt[10] =
!  366                !                 dt[9] = dt[8] = dt[7] = dt[6] = dt[5] = dt[4] =
!  367                !                 dt[3] = dt[2] = dt[1] = Zero;
!  369                !         pdn_0 = pdn[0];
!  370                !         pdm1_0 = pdm1[0];
!  372                !         digit = mod(lower32(a, Zero) * dn0, TwoToMinus16, TwoTo16);
!  373                !         pdtj = &(dt[0]);

/* 0x0040        373 */         or      %g0,%i3,%o3

!  375                !         for (j = 0; j < 32; j++, pdtj++) {

/* 0x0044        375 */         or      %g0,0,%l1
/* 0x0048        353 */         ldd     [%l0],%f42
/* 0x004c        372 */         ldd     [%o2+%lo(TwoToMinus16)],%f44
/* 0x0050            */         ldd     [%l3+%lo(TwoTo16)],%f46
/* 0x0054        367 */         std     %f0,[%i3+8]
/* 0x0058        353 */         fmuld   %f40,%f42,%f38
/* 0x005c            */         std     %f38,[%i3]
/* 0x0060        367 */         std     %f0,[%i3+16]
/* 0x0064            */         std     %f0,[%i3+24]
/* 0x0068            */         std     %f0,[%i3+32]
/* 0x006c        372 */         fdtox   %f38,%f4
/* 0x0070        367 */         std     %f0,[%i3+40]
/* 0x0074            */         std     %f0,[%i3+48]
/* 0x0078            */         std     %f0,[%i3+56]
/* 0x007c        372 */         fmovs   %f0,%f4
/* 0x0080        367 */         std     %f0,[%i3+64]
/* 0x0084            */         std     %f0,[%i3+72]
/* 0x0088        372 */         fxtod   %f4,%f52
/* 0x008c        367 */         std     %f0,[%i3+80]
/* 0x0090            */         std     %f0,[%i3+88]
/* 0x0094            */         std     %f0,[%i3+96]
/* 0x0098            */         std     %f0,[%i3+104]
/* 0x009c        372 */         fmuld   %f52,%f14,%f60
/* 0x00a0        367 */         std     %f0,[%i3+112]
/* 0x00a4            */         std     %f0,[%i3+120]
/* 0x00a8            */         std     %f0,[%i3+128]
/* 0x00ac            */         std     %f0,[%i3+136]
/* 0x00b0        372 */         fmuld   %f60,%f44,%f62
/* 0x00b4        367 */         std     %f0,[%i3+144]
/* 0x00b8            */         std     %f0,[%i3+152]
/* 0x00bc            */         std     %f0,[%i3+160]
/* 0x00c0            */         std     %f0,[%i3+168]
/* 0x00c4        372 */         fdtox   %f62,%f32
/* 0x00c8        367 */         std     %f0,[%i3+176]
/* 0x00cc            */         std     %f0,[%i3+184]
/* 0x00d0            */         std     %f0,[%i3+192]
/* 0x00d4            */         std     %f0,[%i3+200]
/* 0x00d8        372 */         fxtod   %f32,%f50
/* 0x00dc        367 */         std     %f0,[%i3+208]
/* 0x00e0            */         std     %f0,[%i3+216]
/* 0x00e4            */         std     %f0,[%i3+224]
/* 0x00e8            */         std     %f0,[%i3+232]
/* 0x00ec        372 */         fmuld   %f50,%f46,%f34
/* 0x00f0        367 */         std     %f0,[%i3+240]
/* 0x00f4            */         std     %f0,[%i3+248]
/* 0x00f8            */         std     %f0,[%i3+256]
/* 0x00fc            */         std     %f0,[%i3+264]
/* 0x0100        372 */         fsubd   %f60,%f34,%f40
/* 0x0104        367 */         std     %f0,[%i3+272]
/* 0x0108            */         std     %f0,[%i3+280]
/* 0x010c            */         std     %f0,[%i3+288]
/* 0x0110            */         std     %f0,[%i3+296]
/* 0x0114            */         std     %f0,[%i3+304]
/* 0x0118            */         std     %f0,[%i3+312]
/* 0x011c            */         std     %f0,[%i3+320]
/* 0x0120            */         std     %f0,[%i3+328]
/* 0x0124            */         std     %f0,[%i3+336]
/* 0x0128            */         std     %f0,[%i3+344]
/* 0x012c            */         std     %f0,[%i3+352]
/* 0x0130            */         std     %f0,[%i3+360]
/* 0x0134            */         std     %f0,[%i3+368]
/* 0x0138        375 */         sub     %g1,1,%l3
/* 0x013c            */         add     %i3,8,%o7
/* 0x0140        367 */         std     %f0,[%i3+376]
/* 0x0144            */         std     %f0,[%i3+384]
/* 0x0148            */         std     %f0,[%i3+392]
/* 0x014c            */         std     %f0,[%i3+400]
/* 0x0150            */         std     %f0,[%i3+408]
/* 0x0154            */         std     %f0,[%i3+416]
/* 0x0158            */         std     %f0,[%i3+424]
/* 0x015c            */         std     %f0,[%i3+432]
/* 0x0160            */         std     %f0,[%i3+440]
/* 0x0164            */         std     %f0,[%i3+448]
/* 0x0168            */         std     %f0,[%i3+456]
/* 0x016c            */         std     %f0,[%i3+464]
/* 0x0170            */         std     %f0,[%i3+472]
/* 0x0174            */         std     %f0,[%i3+480]
/* 0x0178            */         std     %f0,[%i3+488]
/* 0x017c            */         std     %f0,[%i3+496]
/* 0x0180            */         std     %f0,[%i3+504]
/* 0x0184            */         std     %f0,[%i3+512]
/* 0x0188            */         std     %f0,[%i3+520]

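!BEGIN HAND CODED PART

! cheetah schedule, no even-odd trick
!
! Hand-scheduled version of the nlen == 16 loop
!       for (j = 0; j < 32; j++, pdtj++)
! split into two 16-iteration loops (.L99999999 and .L99999998) with a
! straight-line pass over dt in between.
!
! Register use established by the setup code that follows:
!   %i1  = dm1, %i4 = dn (both left unchanged)
!   %l0  = dm2 pointer, advanced by 8 bytes per iteration
!   %i3  = running dt pointer (pdtj), advanced by 8 bytes per iteration;
!          the incoming dt pointer is saved in %o5 and put back in %i3
!          after the second loop
!   %l1  = iteration counter j
!   %f0  = digit            %f2  = dn0
!   %f4  = current dm2[j]   %f6  = pdtj[0]
!   %f8  = TwoToMinus16     %f10 = TwoTo16
!   %l5 / %g5 = %hi() of TwoTo32 / TwoToMinus32 for the pass between
!          the two loops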


        add     %i3,%g0,%o5

        fmovd   %f40,%f0
        fmovd   %f14,%f2
        fmovd   %f44,%f8
        sethi   %hi(TwoTo32),%l5
        fmovd   %f46,%f10
        sethi   %hi(TwoToMinus32),%g5
        ldd     [%i3],%f6
        ldd     [%l0],%f4

        ldd     [%i1],%f40
        ldd     [%i1+8],%f42
        ldd     [%i1+16],%f52
        ldd     [%i1+48],%f54
        ldd     [%i1+56],%f36
        ldd     [%i1+64],%f56
        ldd     [%i1+104],%f48
        ldd     [%i1+112],%f58

        ldd     [%i4],%f44
        ldd     [%i4+8],%f46
        ldd     [%i4+104],%f50
        ldd     [%i4+112],%f60


        .L99999999:
!1
        ldd     [%i1+24],%f20
        fmuld   %f0,%f44,%f12
!2
        ldd     [%i4+24],%f22
        fmuld   %f42,%f4,%f16
!3
        ldd     [%i1+40],%f24
        fmuld   %f46,%f0,%f18
!4
        ldd     [%i4+40],%f26
        fmuld   %f20,%f4,%f20
!5
        ldd     [%l0+8],%f38
        faddd   %f12,%f6,%f12
        fmuld   %f22,%f0,%f22
!6
        add     %l0,8,%l0
        ldd     [%i4+56],%f30
        fmuld   %f24,%f4,%f24
!7
        ldd     [%i1+72],%f32
        faddd   %f16,%f18,%f16
        fmuld   %f26,%f0,%f26
!8
        ldd     [%i3+16],%f18
        fmuld   %f40,%f38,%f14
!9
        ldd     [%i4+72],%f34
        faddd   %f20,%f22,%f20
        fmuld   %f8,%f12,%f12
!10
        ldd     [%i3+48],%f22
        fmuld   %f36,%f4,%f28
!11
        ldd     [%i3+8],%f6
        faddd   %f16,%f18,%f16
        fmuld   %f30,%f0,%f30
!12
        std     %f16,[%i3+16]
        faddd   %f24,%f26,%f24
        fmuld   %f32,%f4,%f32
!13
        ldd     [%i3+80],%f26
        faddd   %f12,%f14,%f12
        fmuld   %f34,%f0,%f34
!14
        ldd     [%i1+88],%f16
        faddd   %f20,%f22,%f20
!15
        ldd     [%i4+88],%f18
        faddd   %f28,%f30,%f28
!16
        ldd     [%i3+112],%f30
        faddd   %f32,%f34,%f32
!17
        ldd     [%i3+144],%f34
        faddd   %f12,%f6,%f6
        fmuld   %f16,%f4,%f16
!18
        std     %f20,[%i3+48]
        faddd   %f24,%f26,%f24
        fmuld   %f18,%f0,%f18
!19
        std     %f24,[%i3+80]
        faddd   %f28,%f30,%f28
        fmuld   %f48,%f4,%f20
!20
        std     %f28,[%i3+112]
        faddd   %f32,%f34,%f32
        fmuld   %f50,%f0,%f22
!21
        ldd     [%i1+120],%f24
        fdtox   %f6,%f12
!22
        std     %f32,[%i3+144]
        faddd   %f16,%f18,%f16
!23
        ldd     [%i4+120],%f26
!24
        ldd     [%i3+176],%f18
        faddd   %f20,%f22,%f20
        fmuld   %f24,%f4,%f24
!25
        ldd     [%i4+16],%f30
        fmovs   %f11,%f12
!26
        ldd     [%i1+32],%f32
        fmuld   %f26,%f0,%f26
!27
        ldd     [%i4+32],%f34
        fmuld   %f52,%f4,%f28
!28
        ldd     [%i3+208],%f22
        faddd   %f16,%f18,%f16
        fmuld   %f30,%f0,%f30
!29
        std     %f16,[%i3+176]
        fxtod   %f12,%f12
        fmuld   %f32,%f4,%f32
!30
        ldd     [%i4+48],%f18
        faddd   %f24,%f26,%f24
        fmuld   %f34,%f0,%f34
!31
        ldd     [%i3+240],%f26
        faddd   %f20,%f22,%f20
!32
        std     %f20,[%i3+208]
        faddd   %f28,%f30,%f28
        fmuld   %f54,%f4,%f16
!33
        ldd     [%i3+32],%f30
        fmuld   %f12,%f2,%f14
!34
        ldd     [%i4+64],%f22
        faddd   %f32,%f34,%f32
        fmuld   %f18,%f0,%f18
!35
        ldd     [%i3+64],%f34
        faddd   %f24,%f26,%f24
!36
        std     %f24,[%i3+240]
        faddd   %f28,%f30,%f28
        fmuld   %f56,%f4,%f20
!37
        std     %f28,[%i3+32]
        fmuld   %f14,%f8,%f12
!38
        ldd     [%i1+80],%f24
        faddd   %f32,%f34,%f34  ! yes, tmp52!
        fmuld   %f22,%f0,%f22
!39
        ldd     [%i4+80],%f26
        faddd   %f16,%f18,%f16
!40
        ldd     [%i1+96],%f28
        fmuld   %f58,%f4,%f32
!41
        ldd     [%i4+96],%f30
        fdtox   %f12,%f12
        fmuld   %f24,%f4,%f24
!42
        std     %f34,[%i3+64]   ! yes, tmp52!
        faddd   %f20,%f22,%f20
        fmuld   %f26,%f0,%f26
!43
        ldd     [%i3+96],%f18
        fmuld   %f28,%f4,%f28
!44
        ldd     [%i3+128],%f22
        fmovd   %f38,%f4
        fmuld   %f30,%f0,%f30
!45
        fxtod   %f12,%f12
        fmuld   %f60,%f0,%f34
!46
        add     %i3,8,%i3
        faddd   %f24,%f26,%f24
!47
        ldd     [%i3+160-8],%f26
        faddd   %f16,%f18,%f16
!48
        std     %f16,[%i3+96-8]
        faddd   %f28,%f30,%f28
!49
        ldd     [%i3+192-8],%f30
        faddd   %f32,%f34,%f32
        fmuld   %f12,%f10,%f12
!50
        ldd     [%i3+224-8],%f34
        faddd   %f20,%f22,%f20
!51
        std     %f20,[%i3+128-8]
        faddd   %f24,%f26,%f24
!52
        add     %l1,1,%l1
        std     %f24,[%i3+160-8]
        faddd   %f28,%f30,%f28
!53
        cmp     %l1,15
        std     %f28,[%i3+192-8]
        fsubd   %f14,%f12,%f0
!54
        faddd   %f32,%f34,%f32
        ble,pt  %icc,.L99999999
        std     %f32,[%i3+224-8]


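!
! Straight-line pass between the two loops.  %i3 is now 16 elements past
! the dt pointer saved in %o5.  For each of the 32 doubles at
! [%i3+8] .. [%i3+256]:
!       hi = (double)(int64)(x * TwoToMinus32)   (fdtox/fxtod)
!       x -= hi * TwoTo32
! and hi is added to the element 16 bytes (two doubles) further on.
! Each hi is taken from the value as loaded, i.e. before the carry
! coming in from two elements earlier has been added.  The carries out
! of the last two elements are stored to [%i3+264] and [%i3+272].
!
! %f8/%f10 hold TwoToMinus32/TwoTo32 during this pass; near its end they
! are reloaded with TwoToMinus16/TwoTo16, and the dm1/dn operands that
! were overwritten here are reloaded for the second loop.
!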
        ldd     [%g5+%lo(TwoToMinus32)],%f8
!
        ldd     [%i3+8],%f16
!
        ldd     [%i3+16],%f20
!
        fmuld   %f8,%f16,%f18
        ldd     [%i3+24],%f24
!
        fmuld   %f8,%f20,%f22
        ldd     [%i3+32],%f28
!
        fmuld   %f8,%f24,%f26
        ldd     [%l5+%lo(TwoTo32)],%f10
!
        fmuld   %f8,%f28,%f30
!
        fdtox   %f18,%f18
!
        fdtox   %f22,%f22
!
        fdtox   %f26,%f26
        ldd     [%i3+40],%f32
!
        fdtox   %f30,%f30
        ldd     [%i3+48],%f56
!
        fxtod   %f18,%f18
        fmuld   %f8,%f32,%f34
        ldd     [%i3+56],%f36
!
        fxtod   %f22,%f22
        fmuld   %f8,%f56,%f58
        ldd     [%i3+64],%f38
!
        fxtod   %f26,%f26
        fmuld   %f8,%f36,%f60
!
        fxtod   %f30,%f30
        fmuld   %f8,%f38,%f62
!
        fdtox   %f34,%f34
        fmuld   %f10,%f18,%f40
!
        fdtox   %f58,%f58
        fmuld   %f10,%f22,%f42
!
        fdtox   %f60,%f60
        fmuld   %f10,%f26,%f44
!
        fdtox   %f62,%f62
        fmuld   %f10,%f30,%f46
!
        fxtod   %f34,%f34
!
        fxtod   %f58,%f58
!
        fxtod   %f60,%f60
!
        fxtod   %f62,%f62
!
        fsubd   %f16,%f40,%f40
        fmuld   %f10,%f34,%f48
!
        fsubd   %f20,%f42,%f42
        fmuld   %f10,%f58,%f50
!
        fsubd   %f24,%f44,%f44
        fmuld   %f10,%f60,%f52
!
        fsubd   %f28,%f46,%f46
        fmuld   %f10,%f62,%f54
!
        std     %f40,[%i3+8]
!
        std     %f42,[%i3+16]
!
        faddd   %f18,%f44,%f44
        std     %f44,[%i3+24]
!
        faddd   %f22,%f46,%f46
        std     %f46,[%i3+32]
!



        fsubd   %f32,%f48,%f48
        ldd     [%i3+64+8],%f16
!
        fsubd   %f56,%f50,%f50
        ldd     [%i3+64+16],%f20
!
        fsubd   %f36,%f52,%f52
        ldd     [%i3+64+24],%f24
!
        fsubd   %f38,%f54,%f54
        ldd     [%i3+64+32],%f28
!
        faddd   %f26,%f48,%f48
        fmuld   %f8,%f16,%f18
        std     %f48,[%i3+40]
!
        faddd   %f30,%f50,%f50
        fmuld   %f8,%f20,%f22
        std     %f50,[%i3+48]
!
        faddd   %f34,%f52,%f52
        fmuld   %f8,%f24,%f26
        std     %f52,[%i3+56]
!
        faddd   %f58,%f54,%f54
        fmuld   %f8,%f28,%f30
        std     %f54,[%i3+64]
!


        fdtox   %f18,%f18
!
        fdtox   %f22,%f22
!
        fdtox   %f26,%f26
        ldd     [%i3+64+40],%f32
!
        fdtox   %f30,%f30
        ldd     [%i3+64+48],%f56
!
        fxtod   %f18,%f18
        fmuld   %f8,%f32,%f34
        ldd     [%i3+64+56],%f36
!
        fxtod   %f22,%f22
        fmuld   %f8,%f56,%f58
        ldd     [%i3+64+64],%f38
!
        fxtod   %f26,%f26
        fmuld   %f8,%f36,%f12
!
        fxtod   %f30,%f30
        fmuld   %f8,%f38,%f14
!
        fdtox   %f34,%f34
        fmuld   %f10,%f18,%f40
!
        fdtox   %f58,%f58
        fmuld   %f10,%f22,%f42
!
        fdtox   %f12,%f12
        fmuld   %f10,%f26,%f44
!
        fdtox   %f14,%f14
        fmuld   %f10,%f30,%f46
!
        fxtod   %f34,%f34
!
        fxtod   %f58,%f58
!
        fxtod   %f12,%f12
!
        fxtod   %f14,%f14
!
        fsubd   %f16,%f40,%f40
        fmuld   %f10,%f34,%f48
!
        fsubd   %f20,%f42,%f42
        fmuld   %f10,%f58,%f50
!
        fsubd   %f24,%f44,%f44
        fmuld   %f10,%f12,%f52
!
        fsubd   %f28,%f46,%f46
        fmuld   %f10,%f14,%f54
!
        faddd   %f60,%f40,%f40
        std     %f40,[%i3+64+8]
!
        faddd   %f62,%f42,%f42
        std     %f42,[%i3+64+16]
!
        faddd   %f18,%f44,%f44
        std     %f44,[%i3+64+24]
!
        faddd   %f22,%f46,%f46
        std     %f46,[%i3+64+32]
!



        fsubd   %f32,%f48,%f48
        ldd     [%i3+64+64+8],%f16
!
        fsubd   %f56,%f50,%f50
        ldd     [%i3+64+64+16],%f20
!
        fsubd   %f36,%f52,%f52
        ldd     [%i3+64+64+24],%f24
!
        fsubd   %f38,%f54,%f54
        ldd     [%i3+64+64+32],%f28
!
        faddd   %f26,%f48,%f48
        fmuld   %f8,%f16,%f18
        std     %f48,[%i3+64+40]
!
        faddd   %f30,%f50,%f50
        fmuld   %f8,%f20,%f22
        std     %f50,[%i3+64+48]
!
        faddd   %f34,%f52,%f52
        fmuld   %f8,%f24,%f26
        std     %f52,[%i3+64+56]
!
        faddd   %f58,%f54,%f54
        fmuld   %f8,%f28,%f30
        std     %f54,[%i3+64+64]
!



        fdtox   %f18,%f18
!
        fdtox   %f22,%f22
!
        fdtox   %f26,%f26
        ldd     [%i3+64+64+40],%f32
!
        fdtox   %f30,%f30
        ldd     [%i3+64+64+48],%f56
!
        fxtod   %f18,%f18
        fmuld   %f8,%f32,%f34
        ldd     [%i3+64+64+56],%f36
!
        fxtod   %f22,%f22
        fmuld   %f8,%f56,%f58
        ldd     [%i3+64+64+64],%f38
!
        fxtod   %f26,%f26
        fmuld   %f8,%f36,%f60
!
        fxtod   %f30,%f30
        fmuld   %f8,%f38,%f62
!
        fdtox   %f34,%f34
        fmuld   %f10,%f18,%f40
!
        fdtox   %f58,%f58
        fmuld   %f10,%f22,%f42
!
        fdtox   %f60,%f60
        fmuld   %f10,%f26,%f44
!
        fdtox   %f62,%f62
        fmuld   %f10,%f30,%f46
!
        fxtod   %f34,%f34
!
        fxtod   %f58,%f58
!
        fxtod   %f60,%f60
!
        fxtod   %f62,%f62
!
        fsubd   %f16,%f40,%f40
        fmuld   %f10,%f34,%f48
!
        fsubd   %f20,%f42,%f42
        fmuld   %f10,%f58,%f50
!
        fsubd   %f24,%f44,%f44
        fmuld   %f10,%f60,%f52
!
        fsubd   %f28,%f46,%f46
        fmuld   %f10,%f62,%f54
!
        faddd   %f12,%f40,%f40
        std     %f40,[%i3+64+64+8]
!
        faddd   %f14,%f42,%f42
        std     %f42,[%i3+64+64+16]
!
        faddd   %f18,%f44,%f44
        std     %f44,[%i3+64+64+24]
!
        faddd   %f22,%f46,%f46
        std     %f46,[%i3+64+64+32]
!


        fsubd   %f32,%f48,%f48
        ldd     [%i3+64+64+64+8],%f16
!
        fsubd   %f56,%f50,%f50
        ldd     [%i3+64+64+64+16],%f20
!
        fsubd   %f36,%f52,%f52
        ldd     [%i3+64+64+64+24],%f24
!
        fsubd   %f38,%f54,%f54
        ldd     [%i3+64+64+64+32],%f28
!
        faddd   %f26,%f48,%f48
        fmuld   %f8,%f16,%f18
        std     %f48,[%i3+64+64+40]
!
        faddd   %f30,%f50,%f50
        fmuld   %f8,%f20,%f22
        std     %f50,[%i3+64+64+48]
!
        faddd   %f34,%f52,%f52
        fmuld   %f8,%f24,%f26
        std     %f52,[%i3+64+64+56]
!
        faddd   %f58,%f54,%f54
        fmuld   %f8,%f28,%f30
        std     %f54,[%i3+64+64+64]
!


        fdtox   %f18,%f18
!
        fdtox   %f22,%f22
!
        fdtox   %f26,%f26
        ldd     [%i3+64+64+64+40],%f32
!
        fdtox   %f30,%f30
        ldd     [%i3+64+64+64+48],%f56
!
        fxtod   %f18,%f18
        fmuld   %f8,%f32,%f34
        ldd     [%i3+64+64+64+56],%f36
!
        fxtod   %f22,%f22
        fmuld   %f8,%f56,%f58
        ldd     [%i3+64+64+64+64],%f38
!
        fxtod   %f26,%f26
        fmuld   %f8,%f36,%f12
!
        fxtod   %f30,%f30
        fmuld   %f8,%f38,%f14
!
        fdtox   %f34,%f34
        fmuld   %f10,%f18,%f40
!
        fdtox   %f58,%f58
        fmuld   %f10,%f22,%f42
!
        fdtox   %f12,%f12
        fmuld   %f10,%f26,%f44
!
        fdtox   %f14,%f14
        fmuld   %f10,%f30,%f46
!
        sethi   %hi(TwoToMinus16),%g5
        fxtod   %f34,%f34
!
        sethi   %hi(TwoTo16),%l5
        fxtod   %f58,%f58
!
        fxtod   %f12,%f12
!
        fxtod   %f14,%f14
!
        fsubd   %f16,%f40,%f16
        fmuld   %f10,%f34,%f48
        ldd     [%g5+%lo(TwoToMinus16)],%f8
!
        fsubd   %f20,%f42,%f20
        fmuld   %f10,%f58,%f50
        ldd     [%i1],%f40      ! should be %f40
!
        fsubd   %f24,%f44,%f24
        fmuld   %f10,%f12,%f52
        ldd     [%i1+8],%f42    ! should be %f42
!
        fsubd   %f28,%f46,%f28
        fmuld   %f10,%f14,%f54
        ldd     [%i4],%f44      ! should be %f44
!
        faddd   %f60,%f16,%f16
        std     %f16,[%i3+64+64+64+8]
!
        faddd   %f62,%f20,%f20
        std     %f20,[%i3+64+64+64+16]
!
        faddd   %f18,%f24,%f24
        std     %f24,[%i3+64+64+64+24]
!
        faddd   %f22,%f28,%f28
        std     %f28,[%i3+64+64+64+32]
!
        fsubd   %f32,%f48,%f32
        ldd     [%i4+8],%f46     ! should be %f46
!
        fsubd   %f56,%f50,%f56
        ldd     [%i1+104],%f48  ! should be %f48
!
        fsubd   %f36,%f52,%f36
        ldd     [%i4+104],%f50  ! should be %f50
!
        fsubd   %f38,%f54,%f38
        ldd     [%i1+16],%f52   ! should be %f52
!
        faddd   %f26,%f32,%f32
        std     %f32,[%i3+64+64+64+40]
!
        faddd   %f30,%f56,%f56
        std     %f56,[%i3+64+64+64+48]
!
        faddd   %f34,%f36,%f36
        std     %f36,[%i3+64+64+64+56]
!
        faddd   %f58,%f38,%f38
        std     %f38,[%i3+64+64+64+64]
!
        std     %f12,[%i3+64+64+64+64+8]
!
        std     %f14,[%i3+64+64+64+64+16]
!

        ldd     [%l5+%lo(TwoTo16)],%f10
        ldd     [%i1+48],%f54
        ldd     [%i1+56],%f36
        ldd     [%i1+64],%f56
        ldd     [%i1+112],%f58

        ldd     [%i4+104],%f50
        ldd     [%i4+112],%f60


        .L99999998:
!1
        ldd     [%i1+24],%f20
        fmuld   %f0,%f44,%f12
!2
        ldd     [%i4+24],%f22
        fmuld   %f42,%f4,%f16
!3
        ldd     [%i1+40],%f24
        fmuld   %f46,%f0,%f18
!4
        ldd     [%i4+40],%f26
        fmuld   %f20,%f4,%f20
!5
        ldd     [%l0+8],%f38
        faddd   %f12,%f6,%f12
        fmuld   %f22,%f0,%f22
!6
        add     %l0,8,%l0
        ldd     [%i4+56],%f30
        fmuld   %f24,%f4,%f24
!7
        ldd     [%i1+72],%f32
        faddd   %f16,%f18,%f16
        fmuld   %f26,%f0,%f26
!8
        ldd     [%i3+16],%f18
        fmuld   %f40,%f38,%f14
!9
        ldd     [%i4+72],%f34
        faddd   %f20,%f22,%f20
        fmuld   %f8,%f12,%f12
!10
        ldd     [%i3+48],%f22
        fmuld   %f36,%f4,%f28
!11
        ldd     [%i3+8],%f6
        faddd   %f16,%f18,%f16
        fmuld   %f30,%f0,%f30
!12
        std     %f16,[%i3+16]
        faddd   %f24,%f26,%f24
        fmuld   %f32,%f4,%f32
!13
        ldd     [%i3+80],%f26
        faddd   %f12,%f14,%f12
        fmuld   %f34,%f0,%f34
!14
        ldd     [%i1+88],%f16
        faddd   %f20,%f22,%f20
!15
        ldd     [%i4+88],%f18
        faddd   %f28,%f30,%f28
!16
        ldd     [%i3+112],%f30
        faddd   %f32,%f34,%f32
!17
        ldd     [%i3+144],%f34
        faddd   %f12,%f6,%f6
        fmuld   %f16,%f4,%f16
!18
        std     %f20,[%i3+48]
        faddd   %f24,%f26,%f24
        fmuld   %f18,%f0,%f18
!19
        std     %f24,[%i3+80]
        faddd   %f28,%f30,%f28
        fmuld   %f48,%f4,%f20
!20
        std     %f28,[%i3+112]
        faddd   %f32,%f34,%f32
        fmuld   %f50,%f0,%f22
!21
        ldd     [%i1+120],%f24
        fdtox   %f6,%f12
!22
        std     %f32,[%i3+144]
        faddd   %f16,%f18,%f16
!23
        ldd     [%i4+120],%f26
!24
        ldd     [%i3+176],%f18
        faddd   %f20,%f22,%f20
        fmuld   %f24,%f4,%f24
!25
        ldd     [%i4+16],%f30
        fmovs   %f11,%f12
!26
        ldd     [%i1+32],%f32
        fmuld   %f26,%f0,%f26
!27
        ldd     [%i4+32],%f34
        fmuld   %f52,%f4,%f28
!28
        ldd     [%i3+208],%f22
        faddd   %f16,%f18,%f16
        fmuld   %f30,%f0,%f30
!29
        std     %f16,[%i3+176]
        fxtod   %f12,%f12
        fmuld   %f32,%f4,%f32
!30
        ldd     [%i4+48],%f18
        faddd   %f24,%f26,%f24
        fmuld   %f34,%f0,%f34
!31
        ldd     [%i3+240],%f26
        faddd   %f20,%f22,%f20
!32
        std     %f20,[%i3+208]
        faddd   %f28,%f30,%f28
        fmuld   %f54,%f4,%f16
!33
        ldd     [%i3+32],%f30
        fmuld   %f12,%f2,%f14
!34
        ldd     [%i4+64],%f22
        faddd   %f32,%f34,%f32
        fmuld   %f18,%f0,%f18
!35
        ldd     [%i3+64],%f34
        faddd   %f24,%f26,%f24
!36
        std     %f24,[%i3+240]
        faddd   %f28,%f30,%f28
        fmuld   %f56,%f4,%f20
!37
        std     %f28,[%i3+32]
        fmuld   %f14,%f8,%f12
!38
        ldd     [%i1+80],%f24
        faddd   %f32,%f34,%f34  !       yes, tmp52!
        fmuld   %f22,%f0,%f22
!39
        ldd     [%i4+80],%f26
        faddd   %f16,%f18,%f16
!40
        ldd     [%i1+96],%f28
        fmuld   %f58,%f4,%f32
!41
        ldd     [%i4+96],%f30
        fdtox   %f12,%f12
        fmuld   %f24,%f4,%f24
!42
        std     %f34,[%i3+64]   ! yes, tmp52!
        faddd   %f20,%f22,%f20
        fmuld   %f26,%f0,%f26
!43
        ldd     [%i3+96],%f18
        fmuld   %f28,%f4,%f28
!44
        ldd     [%i3+128],%f22
        fmovd   %f38,%f4
        fmuld   %f30,%f0,%f30
!45
        fxtod   %f12,%f12
        fmuld   %f60,%f0,%f34
!46
        add     %i3,8,%i3
        faddd   %f24,%f26,%f24
!47
        ldd     [%i3+160-8],%f26
        faddd   %f16,%f18,%f16
!48
        std     %f16,[%i3+96-8]
        faddd   %f28,%f30,%f28
!49
        ldd     [%i3+192-8],%f30
        faddd   %f32,%f34,%f32
        fmuld   %f12,%f10,%f12
!50
        ldd     [%i3+224-8],%f34
        faddd   %f20,%f22,%f20
!51
        std     %f20,[%i3+128-8]
        faddd   %f24,%f26,%f24
!52
        add     %l1,1,%l1
        std     %f24,[%i3+160-8]
        faddd   %f28,%f30,%f28
!53
        cmp     %l1,31
        std     %f28,[%i3+192-8]
        fsubd   %f14,%f12,%f0
!54
        faddd   %f32,%f34,%f32
        ble,pt  %icc,.L99999998
        std     %f32,[%i3+224-8]
!55
        std     %f6,[%i3]

        add     %o5,%g0,%i3


!END HAND CODED PART
                       .L900000828:
/* 0x03e4        405 */         ba      .L900000852
/* 0x03e8        409 */         ldx     [%i3+%o0],%l1

!  406                !         }
!  407                ! }
!  409                ! conv_d16_to_i32(result, dt + 2 * nlen, (int64_t *)dt, nlen + 1);
!  411                !/*for(i=0;i<nlen+1;i++) saveresult[i]=result[i];*/
!  413                ! adjust_montf_result(result, nint, nlen);

                       .L77000476:
/* 0x03ec        413 */         sll     %g1,2,%l3
/* 0x03f0          0 */         sethi   %hi(TwoTo16),%g5
/* 0x03f4        413 */         add     %l3,2,%l2
/* 0x03f8        328 */         cmp     %l2,0
/* 0x03fc            */         ble,pn  %icc,.L77000482
/* 0x0400          0 */         sethi   %hi(TwoToMinus16),%o2
                       .L77000514:
/* 0x0404        329 */         add     %l3,2,%l2
/* 0x0408        328 */         add     %l3,1,%o4
/* 0x040c            */         or      %g0,0,%l3
/* 0x0410        329 */         cmp     %l2,8
/* 0x0414            */         bl,pn   %icc,.L77000477
/* 0x0418        328 */         or      %g0,%i3,%l1
                       .L900000831:
/* 0x041c        329 */         prefetch        [%i3],22
/* 0x0420            */         sub     %o4,7,%l4
/* 0x0424            */         or      %g0,0,%l3
/* 0x0428            */         or      %g0,%i3,%l1
                       .L900000829:
/* 0x042c        329 */         prefetch        [%l1+528],22
/* 0x0430            */         std     %f0,[%l1]
/* 0x0434            */         add     %l3,8,%l3
/* 0x0438            */         add     %l1,64,%l1
/* 0x043c            */         std     %f0,[%l1-56]
/* 0x0440            */         cmp     %l3,%l4
/* 0x0444            */         std     %f0,[%l1-48]
/* 0x0448            */         std     %f0,[%l1-40]
/* 0x044c            */         prefetch        [%l1+496],22
/* 0x0450            */         std     %f0,[%l1-32]
/* 0x0454            */         std     %f0,[%l1-24]
/* 0x0458            */         std     %f0,[%l1-16]
/* 0x045c            */         ble,pt  %icc,.L900000829
/* 0x0460            */         std     %f0,[%l1-8]
                       .L900000832:
/* 0x0464        329 */         cmp     %l3,%o4
/* 0x0468            */         bg,pn   %icc,.L77000482
/* 0x046c            */         nop
                       .L77000477:
/* 0x0470        329 */         add     %l3,1,%l3
                       .L900000851:
/* 0x0474        329 */         std     %f0,[%l1]
/* 0x0478            */         cmp     %l3,%o4
/* 0x047c            */         add     %l1,8,%l1
/* 0x0480            */         ble,pt  %icc,.L900000851
/* 0x0484            */         add     %l3,1,%l3
                       .L77000482:
/* 0x0488        330 */         ldd     [%i1],%f40
/* 0x048c        334 */         cmp     %o3,0
/* 0x0490            */         sub     %g1,1,%l3
/* 0x0494        330 */         ldd     [%l0],%f42
/* 0x0498        331 */         ldd     [%o2+%lo(TwoToMinus16)],%f36
/* 0x049c            */         ldd     [%g5+%lo(TwoTo16)],%f38
/* 0x04a0        330 */         fmuld   %f40,%f42,%f52
/* 0x04a4        331 */         fdtox   %f52,%f8
/* 0x04a8            */         fmovs   %f0,%f8
/* 0x04ac            */         fxtod   %f8,%f62
/* 0x04b0            */         fmuld   %f62,%f14,%f60
/* 0x04b4            */         fmuld   %f60,%f36,%f32
/* 0x04b8            */         fdtox   %f32,%f50
/* 0x04bc            */         fxtod   %f50,%f34
/* 0x04c0            */         fmuld   %f34,%f38,%f46
/* 0x04c4            */         fsubd   %f60,%f46,%f40
/* 0x04c8        334 */         ble,pn  %icc,.L77000378
/* 0x04cc        330 */         std     %f52,[%i3]
                       .L77000509:
/* 0x04d0        345 */         add     %o3,1,%g5
/* 0x04d4            */         sll     %g5,1,%o2
/* 0x04d8            */         or      %g0,0,%l1
/* 0x04dc        337 */         ldd     [%i4],%f42
/* 0x04e0        345 */         sub     %o3,1,%o3
/* 0x04e4            */         or      %g0,0,%o5
/* 0x04e8            */         or      %g0,%i3,%l2
/* 0x04ec            */         add     %i4,8,%o1
/* 0x04f0            */         add     %i1,8,%g5
                       .L900000848:
/* 0x04f4        337 */         fmuld   %f40,%f42,%f34
/* 0x04f8            */         ldd     [%l0+8],%f32
/* 0x04fc        341 */         cmp     %g1,1
/* 0x0500        337 */         ldd     [%i1],%f50
/* 0x0504            */         ldd     [%l2],%f46
/* 0x0508            */         ldd     [%l2+8],%f44
/* 0x050c            */         fmuld   %f50,%f32,%f60
/* 0x0510        335 */         ldd     [%l0],%f42
/* 0x0514        337 */         faddd   %f46,%f34,%f48
/* 0x0518            */         faddd   %f44,%f60,%f58
/* 0x051c            */         fmuld   %f36,%f48,%f54
/* 0x0520            */         faddd   %f58,%f54,%f34
/* 0x0524        341 */         ble,pn  %icc,.L77000368
/* 0x0528        338 */         std     %f34,[%l2+8]
                       .L77000507:
/* 0x052c        341 */         or      %g0,1,%l5
/* 0x0530            */         or      %g0,2,%l4
/* 0x0534            */         or      %g0,%g5,%g4
/* 0x0538        342 */         cmp     %l3,12
/* 0x053c            */         bl,pn   %icc,.L77000481
/* 0x0540        341 */         or      %g0,%o1,%g3
                       .L900000839:
/* 0x0544        342 */         prefetch        [%i1+8],0
/* 0x0548            */         prefetch        [%i1+72],0
/* 0x054c            */         add     %i4,40,%l6
/* 0x0550            */         add     %i1,40,%l7
/* 0x0554            */         prefetch        [%l2+16],0
/* 0x0558            */         or      %g0,%l2,%o7
/* 0x055c            */         sub     %l3,7,%i5
/* 0x0560            */         prefetch        [%l2+80],0
/* 0x0564            */         add     %l2,80,%g2
/* 0x0568            */         or      %g0,2,%l4
/* 0x056c            */         prefetch        [%i1+136],0
/* 0x0570            */         or      %g0,5,%l5
/* 0x0574            */         prefetch        [%i1+200],0
/* 0x0578            */         prefetch        [%l2+144],0
/* 0x057c            */         ldd     [%i4+8],%f52
/* 0x0580            */         ldd     [%i4+16],%f44
/* 0x0584            */         ldd     [%i4+24],%f56
/* 0x0588            */         fmuld   %f40,%f52,%f48
/* 0x058c            */         fmuld   %f40,%f44,%f46
/* 0x0590            */         fmuld   %f40,%f56,%f44
/* 0x0594            */         ldd     [%l2+48],%f56
/* 0x0598            */         prefetch        [%l2+208],0
/* 0x059c            */         prefetch        [%l2+272],0
/* 0x05a0            */         prefetch        [%l2+336],0
/* 0x05a4            */         prefetch        [%l2+400],0
/* 0x05a8            */         ldd     [%i1+8],%f32
/* 0x05ac            */         ldd     [%i1+16],%f60
/* 0x05b0            */         ldd     [%i1+24],%f50
/* 0x05b4            */         fmuld   %f42,%f32,%f62
/* 0x05b8            */         ldd     [%i1+32],%f32
/* 0x05bc            */         fmuld   %f42,%f60,%f58
/* 0x05c0            */         ldd     [%l2+16],%f52
/* 0x05c4            */         ldd     [%l2+32],%f54
/* 0x05c8            */         faddd   %f62,%f48,%f60
/* 0x05cc            */         fmuld   %f42,%f50,%f48
/* 0x05d0            */         faddd   %f58,%f46,%f62
/* 0x05d4            */         ldd     [%i4+32],%f46
/* 0x05d8            */         ldd     [%l2+64],%f58
                       .L900000837:
/* 0x05dc        342 */         prefetch        [%l7+192],0
/* 0x05e0            */         fmuld   %f40,%f46,%f46
/* 0x05e4            */         faddd   %f60,%f52,%f60
/* 0x05e8            */         ldd     [%l6],%f52
/* 0x05ec            */         std     %f60,[%g2-64]
/* 0x05f0            */         fmuld   %f42,%f32,%f50
/* 0x05f4            */         add     %l5,8,%l5
/* 0x05f8            */         ldd     [%l7],%f60
/* 0x05fc            */         faddd   %f48,%f44,%f48
/* 0x0600            */         cmp     %l5,%i5
/* 0x0604            */         ldd     [%g2],%f32
/* 0x0608            */         add     %g2,128,%g2
/* 0x060c            */         prefetch        [%g2+256],0
/* 0x0610            */         fmuld   %f40,%f52,%f52
/* 0x0614            */         faddd   %f62,%f54,%f44
/* 0x0618            */         ldd     [%l6+8],%f54
/* 0x061c            */         std     %f44,[%g2-176]
/* 0x0620            */         fmuld   %f42,%f60,%f44
/* 0x0624            */         add     %l6,64,%l6
/* 0x0628            */         ldd     [%l7+8],%f60
/* 0x062c            */         faddd   %f50,%f46,%f50
/* 0x0630            */         add     %l7,64,%l7
/* 0x0634            */         add     %l4,16,%l4
/* 0x0638            */         ldd     [%g2-112],%f46
/* 0x063c            */         fmuld   %f40,%f54,%f54
/* 0x0640            */         faddd   %f48,%f56,%f62
/* 0x0644            */         ldd     [%l6-48],%f56
/* 0x0648            */         std     %f62,[%g2-160]
/* 0x064c            */         fmuld   %f42,%f60,%f48
/* 0x0650            */         ldd     [%l7-48],%f60
/* 0x0654            */         faddd   %f44,%f52,%f52
/* 0x0658            */         ldd     [%g2-96],%f30
/* 0x065c            */         prefetch        [%g2+288],0
/* 0x0660            */         fmuld   %f40,%f56,%f56
/* 0x0664            */         faddd   %f50,%f58,%f62
/* 0x0668            */         ldd     [%l6-40],%f58
/* 0x066c            */         std     %f62,[%g2-144]
/* 0x0670            */         fmuld   %f42,%f60,%f50
/* 0x0674            */         ldd     [%l7-40],%f62
/* 0x0678            */         faddd   %f48,%f54,%f54
/* 0x067c            */         ldd     [%g2-80],%f28
/* 0x0680            */         prefetch        [%l7+160],0
/* 0x0684            */         fmuld   %f40,%f58,%f48
/* 0x0688            */         faddd   %f52,%f32,%f44
/* 0x068c            */         ldd     [%l6-32],%f58
/* 0x0690            */         std     %f44,[%g2-128]
/* 0x0694            */         fmuld   %f42,%f62,%f44
/* 0x0698            */         ldd     [%l7-32],%f60
/* 0x069c            */         faddd   %f50,%f56,%f56
/* 0x06a0            */         ldd     [%g2-64],%f52
/* 0x06a4            */         prefetch        [%g2+320],0
/* 0x06a8            */         fmuld   %f40,%f58,%f50
/* 0x06ac            */         faddd   %f54,%f46,%f32
/* 0x06b0            */         ldd     [%l6-24],%f62
/* 0x06b4            */         std     %f32,[%g2-112]
/* 0x06b8            */         fmuld   %f42,%f60,%f46
/* 0x06bc            */         ldd     [%l7-24],%f60
/* 0x06c0            */         faddd   %f44,%f48,%f48
/* 0x06c4            */         ldd     [%g2-48],%f54
/* 0x06c8            */         fmuld   %f40,%f62,%f26
/* 0x06cc            */         faddd   %f56,%f30,%f32
/* 0x06d0            */         ldd     [%l6-16],%f58
/* 0x06d4            */         std     %f32,[%g2-96]
/* 0x06d8            */         fmuld   %f42,%f60,%f30
/* 0x06dc            */         ldd     [%l7-16],%f32
/* 0x06e0            */         faddd   %f46,%f50,%f60
/* 0x06e4            */         ldd     [%g2-32],%f56
/* 0x06e8            */         prefetch        [%g2+352],0
/* 0x06ec            */         fmuld   %f40,%f58,%f44
/* 0x06f0            */         faddd   %f48,%f28,%f62
/* 0x06f4            */         ldd     [%l6-8],%f46
/* 0x06f8            */         std     %f62,[%g2-80]
/* 0x06fc            */         fmuld   %f42,%f32,%f48
/* 0x0700            */         ldd     [%l7-8],%f32
/* 0x0704            */         faddd   %f30,%f26,%f62
/* 0x0708            */         ble,pt  %icc,.L900000837
/* 0x070c            */         ldd     [%g2-16],%f58
                       .L900000840:
/* 0x0710        342 */         fmuld   %f40,%f46,%f46
/* 0x0714            */         faddd   %f62,%f54,%f62
/* 0x0718            */         std     %f62,[%g2-48]
/* 0x071c            */         cmp     %l5,%l3
/* 0x0720            */         fmuld   %f42,%f32,%f50
/* 0x0724            */         faddd   %f48,%f44,%f48
/* 0x0728            */         or      %g0,%l7,%g4
/* 0x072c            */         or      %g0,%l6,%g3
/* 0x0730            */         faddd   %f60,%f52,%f60
/* 0x0734            */         std     %f60,[%g2-64]
/* 0x0738            */         or      %g0,%o7,%l2
/* 0x073c            */         add     %l4,8,%l4
/* 0x0740            */         faddd   %f50,%f46,%f54
/* 0x0744            */         faddd   %f48,%f56,%f56
/* 0x0748            */         std     %f56,[%g2-32]
/* 0x074c            */         faddd   %f54,%f58,%f58
/* 0x0750            */         bg,pn   %icc,.L77000368
/* 0x0754            */         std     %f58,[%g2-16]
                       .L77000481:
/* 0x0758        342 */         ldd     [%g4],%f44
                       .L900000850:
/* 0x075c        342 */         ldd     [%g3],%f48
/* 0x0760            */         fmuld   %f42,%f44,%f58
/* 0x0764            */         sra     %l4,0,%l7
/* 0x0768            */         add     %l5,1,%l5
/* 0x076c            */         sllx    %l7,3,%g2
/* 0x0770            */         add     %g4,8,%g4
/* 0x0774            */         ldd     [%l2+%g2],%f56
/* 0x0778            */         cmp     %l5,%l3
/* 0x077c            */         add     %l4,2,%l4
/* 0x0780            */         fmuld   %f40,%f48,%f54
/* 0x0784            */         add     %g3,8,%g3
/* 0x0788            */         faddd   %f58,%f54,%f52
/* 0x078c            */         faddd   %f52,%f56,%f62
/* 0x0790            */         std     %f62,[%l2+%g2]
/* 0x0794            */         ble,a,pt        %icc,.L900000850
/* 0x0798            */         ldd     [%g4],%f44
                       .L77000368:
/* 0x079c        344 */         cmp     %o5,15
/* 0x07a0            */         bne,pn  %icc,.L77000483
/* 0x07a4        345 */         srl     %l1,31,%g4
                       .L77000478:
/* 0x07a8        345 */         add     %l1,%g4,%l4
/* 0x07ac            */         sra     %l4,1,%o7
/* 0x07b0            */         add     %o7,1,%o4
/* 0x07b4            */         sll     %o4,1,%l6
/* 0x07b8            */         cmp     %l6,%o2
/* 0x07bc            */         bge,pn  %icc,.L77000392
/* 0x07c0            */         fmovd   %f0,%f42
                       .L77000508:
/* 0x07c4        345 */         sra     %l6,0,%l4
/* 0x07c8            */         sllx    %l4,3,%g2
/* 0x07cc            */         fmovd   %f0,%f32
/* 0x07d0            */         sub     %o2,1,%l5
/* 0x07d4            */         ldd     [%g2+%i3],%f40
/* 0x07d8            */         add     %g2,%i3,%g3
                       .L900000849:
/* 0x07dc        345 */         fdtox   %f40,%f10
/* 0x07e0            */         ldd     [%g3+8],%f52
/* 0x07e4            */         add     %l6,2,%l6
/* 0x07e8            */         cmp     %l6,%l5
/* 0x07ec            */         fdtox   %f52,%f2
/* 0x07f0            */         fmovd   %f10,%f30
/* 0x07f4            */         fmovs   %f0,%f10
/* 0x07f8            */         fmovs   %f0,%f2
/* 0x07fc            */         fxtod   %f10,%f10
/* 0x0800            */         fxtod   %f2,%f2
/* 0x0804            */         fdtox   %f52,%f28
/* 0x0808            */         faddd   %f10,%f32,%f56
/* 0x080c            */         std     %f56,[%g3]
/* 0x0810            */         faddd   %f2,%f42,%f62
/* 0x0814            */         std     %f62,[%g3+8]
/* 0x0818            */         fitod   %f30,%f32
/* 0x081c            */         add     %g3,16,%g3
/* 0x0820            */         fitod   %f28,%f42
/* 0x0824            */         ble,a,pt        %icc,.L900000849
/* 0x0828            */         ldd     [%g3],%f40
                       .L77000392:
/* 0x082c        346 */         or      %g0,0,%o5
                       .L77000483:
/* 0x0830        350 */         fdtox   %f34,%f6
/* 0x0834            */         add     %l1,1,%l1
/* 0x0838            */         cmp     %l1,%o3
/* 0x083c            */         add     %o5,1,%o5
/* 0x0840            */         add     %l2,8,%l2
/* 0x0844            */         add     %l0,8,%l0
/* 0x0848            */         fmovs   %f0,%f6
/* 0x084c            */         fxtod   %f6,%f46
/* 0x0850            */         fmuld   %f46,%f14,%f56
/* 0x0854            */         fmuld   %f56,%f36,%f44
/* 0x0858            */         fdtox   %f44,%f48
/* 0x085c            */         fxtod   %f48,%f58
/* 0x0860            */         fmuld   %f58,%f38,%f54
/* 0x0864            */         fsubd   %f56,%f54,%f40
/* 0x0868            */         ble,a,pt        %icc,.L900000848
/* 0x086c        337 */         ldd     [%i4],%f42
                       .L77000378:
/* 0x0870        409 */         ldx     [%i3+%o0],%l1
                       .L900000852:
/* 0x0874        409 */         add     %i3,%o0,%l4
/* 0x0878            */         ldx     [%l4+8],%i1
/* 0x087c            */         cmp     %l1,0
/* 0x0880            */         bne,pn  %xcc,.L77000403
/* 0x0884            */         or      %g0,0,%g5
                       .L77000402:
/* 0x0888        409 */         or      %g0,0,%i3
/* 0x088c            */         ba      .L900000847
/* 0x0890            */         cmp     %i1,0
                       .L77000403:
/* 0x0894        409 */         srlx    %l1,52,%o5
/* 0x0898            */         sethi   %hi(0xfff00000),%i3
/* 0x089c            */         sllx    %i3,32,%o2
/* 0x08a0            */         sethi   %hi(0x40000000),%o0
/* 0x08a4            */         sllx    %o0,22,%o4
/* 0x08a8            */         or      %g0,1023,%l0
/* 0x08ac            */         xor     %o2,-1,%o3
/* 0x08b0            */         sub     %l0,%o5,%o7
/* 0x08b4            */         and     %l1,%o3,%l1
/* 0x08b8            */         add     %o7,52,%i4
/* 0x08bc            */         or      %l1,%o4,%o1
/* 0x08c0            */         cmp     %i1,0
/* 0x08c4            */         srlx    %o1,%i4,%i3
                       .L900000847:
/* 0x08c8        409 */         bne,pn  %xcc,.L77000409
/* 0x08cc            */         or      %g0,0,%o7
                       .L77000408:
/* 0x08d0        409 */         ba      .L900000846
/* 0x08d4        350 */         cmp     %g1,0
                       .L77000409:
/* 0x08d8        409 */         srlx    %i1,52,%l2
/* 0x08dc            */         sethi   %hi(0xfff00000),%o7
/* 0x08e0            */         sllx    %o7,32,%i4
/* 0x08e4            */         sethi   %hi(0x40000000),%i5
/* 0x08e8            */         sllx    %i5,22,%l6
/* 0x08ec            */         or      %g0,1023,%l5
/* 0x08f0            */         xor     %i4,-1,%o1
/* 0x08f4            */         sub     %l5,%l2,%g2
/* 0x08f8            */         and     %i1,%o1,%l7
/* 0x08fc            */         add     %g2,52,%g3
/* 0x0900            */         or      %l7,%l6,%g4
/* 0x0904        350 */         cmp     %g1,0
/* 0x0908        409 */         srlx    %g4,%g3,%o7
                       .L900000846:
/* 0x090c        350 */         ble,pn  %icc,.L77000397
/* 0x0910            */         or      %g0,0,%l5
                       .L77000510:
/* 0x0914        409 */         sethi   %hi(0xfff00000),%g4
/* 0x0918            */         sllx    %g4,32,%o0
/* 0x091c          0 */         or      %g0,-1,%i5
/* 0x0920        409 */         srl     %i5,0,%l7
/* 0x0924            */         sethi   %hi(0x40000000),%i1
/* 0x0928            */         sllx    %i1,22,%l6
/* 0x092c            */         sethi   %hi(0xfc00),%i4
/* 0x0930            */         xor     %o0,-1,%g2
/* 0x0934            */         add     %i4,1023,%l2
/* 0x0938            */         or      %g0,2,%g4
/* 0x093c            */         or      %g0,%i2,%g3
                       .L77000395:
/* 0x0940        409 */         sra     %g4,0,%o2
/* 0x0944            */         add     %g4,1,%o3
/* 0x0948            */         sllx    %o2,3,%o0
/* 0x094c            */         sra     %o3,0,%o5
/* 0x0950            */         ldx     [%l4+%o0],%o4
/* 0x0954            */         sllx    %o5,3,%l0
/* 0x0958            */         and     %i3,%l7,%o1
/* 0x095c            */         ldx     [%l4+%l0],%i4
/* 0x0960            */         cmp     %o4,0
/* 0x0964            */         bne,pn  %xcc,.L77000415
/* 0x0968        350 */         and     %o7,%l2,%i5
                       .L77000414:
/* 0x096c        409 */         or      %g0,0,%l1
/* 0x0970            */         ba      .L900000845
/* 0x0974            */         add     %g5,%o1,%i1
                       .L77000415:
/* 0x0978        409 */         srlx    %o4,52,%o3
/* 0x097c            */         and     %o4,%g2,%l1
/* 0x0980            */         or      %g0,52,%o0
/* 0x0984            */         sub     %o3,1023,%l0
/* 0x0988            */         or      %l1,%l6,%o4
/* 0x098c            */         sub     %o0,%l0,%o5
/* 0x0990            */         srlx    %o4,%o5,%l1
/* 0x0994            */         add     %g5,%o1,%i1
                       .L900000845:
/* 0x0998        409 */         srax    %i3,32,%g5
/* 0x099c            */         cmp     %i4,0
/* 0x09a0            */         bne,pn  %xcc,.L77000421
/* 0x09a4        350 */         sllx    %i5,16,%o2
                       .L77000420:
/* 0x09a8        409 */         or      %g0,0,%o4
/* 0x09ac            */         ba      .L900000844
/* 0x09b0        350 */         add     %i1,%o2,%o5
                       .L77000421:
/* 0x09b4        409 */         srlx    %i4,52,%o4
/* 0x09b8            */         or      %g0,52,%o0
/* 0x09bc            */         sub     %o4,1023,%o3
/* 0x09c0            */         and     %i4,%g2,%i3
/* 0x09c4            */         or      %i3,%l6,%o5
/* 0x09c8            */         sub     %o0,%o3,%l0
/* 0x09cc            */         srlx    %o5,%l0,%o4
/* 0x09d0        350 */         add     %i1,%o2,%o5
                       .L900000844:
/* 0x09d4        350 */         srax    %o7,16,%i4
/* 0x09d8            */         srax    %o5,32,%i5
/* 0x09dc            */         add     %i4,%i5,%o1
/* 0x09e0            */         add     %l5,1,%l5
/* 0x09e4            */         and     %o5,%l7,%i1
/* 0x09e8            */         add     %g5,%o1,%g5
/* 0x09ec            */         st      %i1,[%g3]
/* 0x09f0            */         or      %g0,%l1,%i3
/* 0x09f4            */         or      %g0,%o4,%o7
/* 0x09f8            */         add     %g4,2,%g4
/* 0x09fc            */         cmp     %l5,%l3
/* 0x0a00            */         ble,pt  %icc,.L77000395
/* 0x0a04            */         add     %g3,4,%g3
                       .L77000397:
/* 0x0a08        409 */         sethi   %hi(0xfc00),%l4
/* 0x0a0c            */         sra     %l5,0,%i5
/* 0x0a10            */         add     %l4,1023,%i1
/* 0x0a14            */         add     %g5,%i3,%l5
/* 0x0a18            */         and     %o7,%i1,%g5
/* 0x0a1c            */         sllx    %g5,16,%l2
/* 0x0a20            */         sllx    %i5,2,%l7
/* 0x0a24        413 */         sra     %g1,0,%g2
/* 0x0a28        409 */         add     %l5,%l2,%l6
/* 0x0a2c            */         st      %l6,[%i2+%l7]
/* 0x0a30        413 */         sllx    %g2,2,%g3
/* 0x0a34            */         ld      [%i2+%g3],%g4
/* 0x0a38            */         cmp     %g4,0
/* 0x0a3c            */         bgu,pn  %icc,.L77000486
/* 0x0a40            */         cmp     %l3,0
                       .L77000427:
/* 0x0a44        413 */         bl,pn   %icc,.L77000486
/* 0x0a48            */         or      %g0,%l3,%i5
                       .L77000512:
/* 0x0a4c        413 */         sra     %l3,0,%o5
/* 0x0a50            */         sllx    %o5,2,%l7
/* 0x0a54            */         ld      [%l7+%i0],%o5
/* 0x0a58            */         add     %l7,%i2,%o1
/* 0x0a5c            */         add     %l7,%i0,%i4
                       .L900000843:
/* 0x0a60        413 */         ld      [%o1],%i1
/* 0x0a64            */         cmp     %i1,%o5
/* 0x0a68            */         bne,pn  %icc,.L77000435
/* 0x0a6c            */         sub     %o1,4,%o1
                       .L77000431:
/* 0x0a70        413 */         sub     %i4,4,%i4
/* 0x0a74            */         subcc   %i5,1,%i5
/* 0x0a78            */         bpos,a,pt       %icc,.L900000843
/* 0x0a7c            */         ld      [%i4],%o5
                       .L900000827:
/* 0x0a80        413 */         ba      .L900000842
/* 0x0a84        350 */         cmp     %g1,0
                       .L77000435:
/* 0x0a88        413 */         sra     %i5,0,%o0
/* 0x0a8c            */         sllx    %o0,2,%l1
/* 0x0a90            */         ld      [%i0+%l1],%i3
/* 0x0a94            */         ld      [%i2+%l1],%l0
/* 0x0a98            */         cmp     %l0,%i3
/* 0x0a9c            */         bleu,pt %icc,.L77000379
/* 0x0aa0            */         nop
                       .L77000486:
/* 0x0aa4        350 */         cmp     %g1,0
                       .L900000842:
/* 0x0aa8        350 */         ble,pn  %icc,.L77000379
/* 0x0aac            */         add     %l3,1,%g3
                       .L77000511:
/* 0x0ab0        350 */         or      %g0,0,%l5
/* 0x0ab4            */         cmp     %g3,10
/* 0x0ab8            */         bl,pn   %icc,.L77000487
/* 0x0abc            */         or      %g0,0,%g1
                       .L900000835:
/* 0x0ac0        350 */         prefetch        [%i2],22
/* 0x0ac4            */         add     %i0,4,%l2
/* 0x0ac8            */         prefetch        [%i2+64],22
/* 0x0acc            */         add     %i2,8,%o5
/* 0x0ad0            */         sub     %l3,7,%i0
/* 0x0ad4            */         prefetch        [%i2+128],22
/* 0x0ad8            */         or      %g0,2,%l5
/* 0x0adc            */         prefetch        [%i2+192],22
/* 0x0ae0            */         prefetch        [%i2+256],22
/* 0x0ae4            */         prefetch        [%i2+320],22
/* 0x0ae8            */         prefetch        [%i2+384],22
/* 0x0aec            */         ld      [%l2-4],%l7
/* 0x0af0            */         ld      [%o5-4],%l6
/* 0x0af4            */         prefetch        [%o5+440],22
/* 0x0af8            */         prefetch        [%o5+504],22
/* 0x0afc            */         ld      [%i2],%i2
/* 0x0b00            */         sub     %i2,%l7,%g3
/* 0x0b04            */         st      %g3,[%o5-8]
/* 0x0b08            */         srax    %g3,32,%l7
                       .L900000833:
/* 0x0b0c        350 */         add     %l5,8,%l5
/* 0x0b10            */         add     %o5,32,%o5
/* 0x0b14            */         ld      [%l2],%i5
/* 0x0b18            */         prefetch        [%o5+496],22
/* 0x0b1c            */         cmp     %l5,%i0
/* 0x0b20            */         add     %l2,32,%l2
/* 0x0b24            */         sub     %l6,%i5,%g5
/* 0x0b28            */         add     %g5,%l7,%o0
/* 0x0b2c            */         ld      [%o5-32],%l4
/* 0x0b30            */         st      %o0,[%o5-36]
/* 0x0b34            */         srax    %o0,32,%i3
/* 0x0b38            */         ld      [%l2-28],%i1
/* 0x0b3c            */         sub     %l4,%i1,%i4
/* 0x0b40            */         add     %i4,%i3,%o1
/* 0x0b44            */         ld      [%o5-28],%o3
/* 0x0b48            */         st      %o1,[%o5-32]
/* 0x0b4c            */         srax    %o1,32,%l1
/* 0x0b50            */         ld      [%l2-24],%o2
/* 0x0b54            */         sub     %o3,%o2,%g2
/* 0x0b58            */         add     %g2,%l1,%o7
/* 0x0b5c            */         ld      [%o5-24],%l0
/* 0x0b60            */         st      %o7,[%o5-28]
/* 0x0b64            */         srax    %o7,32,%l6
/* 0x0b68            */         ld      [%l2-20],%o4
/* 0x0b6c            */         sub     %l0,%o4,%g1
/* 0x0b70            */         add     %g1,%l6,%l7
/* 0x0b74            */         ld      [%o5-20],%i2
/* 0x0b78            */         st      %l7,[%o5-24]
/* 0x0b7c            */         srax    %l7,32,%g4
/* 0x0b80            */         ld      [%l2-16],%g3
/* 0x0b84            */         sub     %i2,%g3,%i5
/* 0x0b88            */         add     %i5,%g4,%g5
/* 0x0b8c            */         ld      [%o5-16],%i1
/* 0x0b90            */         st      %g5,[%o5-20]
/* 0x0b94            */         srax    %g5,32,%l4
/* 0x0b98            */         ld      [%l2-12],%o0
/* 0x0b9c            */         sub     %i1,%o0,%i3
/* 0x0ba0            */         add     %i3,%l4,%i4
/* 0x0ba4            */         ld      [%o5-12],%o2
/* 0x0ba8            */         st      %i4,[%o5-16]
/* 0x0bac            */         srax    %i4,32,%o3
/* 0x0bb0            */         ld      [%l2-8],%o1
/* 0x0bb4            */         sub     %o2,%o1,%l1
/* 0x0bb8            */         add     %l1,%o3,%g2
/* 0x0bbc            */         ld      [%o5-8],%o4
/* 0x0bc0            */         st      %g2,[%o5-12]
/* 0x0bc4            */         srax    %g2,32,%l0
/* 0x0bc8            */         ld      [%l2-4],%o7
/* 0x0bcc            */         sub     %o4,%o7,%l6
/* 0x0bd0            */         add     %l6,%l0,%g1
/* 0x0bd4            */         ld      [%o5-4],%l6
/* 0x0bd8            */         st      %g1,[%o5-8]
/* 0x0bdc            */         ble,pt  %icc,.L900000833
/* 0x0be0            */         srax    %g1,32,%l7
                       .L900000836:
/* 0x0be4        350 */         ld      [%l2],%l0
/* 0x0be8            */         add     %l2,4,%i0
/* 0x0bec            */         or      %g0,%o5,%i2
/* 0x0bf0            */         cmp     %l5,%l3
/* 0x0bf4            */         sub     %l6,%l0,%l6
/* 0x0bf8            */         add     %l6,%l7,%g1
/* 0x0bfc            */         st      %g1,[%o5-4]
/* 0x0c00            */         bg,pn   %icc,.L77000379
/* 0x0c04            */         srax    %g1,32,%g1
                       .L77000487:
/* 0x0c08        350 */         ld      [%i2],%o4
                       .L900000841:
/* 0x0c0c        350 */         ld      [%i0],%i3
/* 0x0c10            */         add     %g1,%o4,%l0
/* 0x0c14            */         add     %l5,1,%l5
/* 0x0c18            */         cmp     %l5,%l3
/* 0x0c1c            */         add     %i0,4,%i0
/* 0x0c20            */         sub     %l0,%i3,%l6
/* 0x0c24            */         st      %l6,[%i2]
/* 0x0c28            */         srax    %l6,32,%g1
/* 0x0c2c            */         add     %i2,4,%i2
/* 0x0c30            */         ble,a,pt        %icc,.L900000841
/* 0x0c34            */         ld      [%i2],%o4
                       .L77000379:
/* 0x0c38        405 */         ret     ! Result =
/* 0x0c3c            */         restore %g0,%g0,%g0
/* 0x0c40          0 */         .type   mont_mulf_noconv,2
/* 0x0c40          0 */         .size   mont_mulf_noconv,(.-mont_mulf_noconv)

! Begin Disassembling Debug Info
        .xstabs ".stab.index","V=10.0;DBG_GEN=4.14.14;cd;backend;Xa;O;R=Sun C 5.5 Patch 112760-07 2004/02/03",60,0,0,0
        .xstabs ".stab.index","/workspace/ferenc/algorithms/bignum/unified/mont_mulf; /ws/onnv-tools/SUNWspro/SOS8/prod/bin/cc -D_KERNEL -DRF_INLINE_MACROS -fast -xarch=v9 -xO5 -xstrconst -xdepend -Xa -xchip=ultra3 -xcode=abs32 -Wc,-Qrm-Qd -Wc,-Qrm-Qf -Wc,-assembly -V -W0,-xp -c conv_v9.il -o mont_mulf.o  mont_mulf.c",52,0,0,0

! End Disassembling Debug Info

! Begin Disassembling Ident
        .ident  "cg: Sun Compiler Common 7.1 Patch 112763-10 2004/01/27"        ! (NO SOURCE LINE)
        .ident  "@(#)mont_mulf.c\t1.2\t01/09/24 SMI"    ! (/tmp/acompAAApja4Fx:8)
        .ident  "@(#)types.h\t1.74\t03/08/07 SMI"       ! (/tmp/acompAAApja4Fx:9)
        .ident  "@(#)isa_defs.h\t1.20\t99/05/04 SMI"    ! (/tmp/acompAAApja4Fx:10)
        .ident  "@(#)feature_tests.h\t1.18\t99/07/26 SMI"       ! (/tmp/acompAAApja4Fx:11)
        .ident  "@(#)machtypes.h\t1.13\t99/05/04 SMI"   ! (/tmp/acompAAApja4Fx:12)
        .ident  "@(#)inttypes.h\t1.2\t98/01/16 SMI"     ! (/tmp/acompAAApja4Fx:13)
        .ident  "@(#)int_types.h\t1.6\t97/08/20 SMI"    ! (/tmp/acompAAApja4Fx:14)
        .ident  "@(#)int_limits.h\t1.6\t99/08/06 SMI"   ! (/tmp/acompAAApja4Fx:15)
        .ident  "@(#)int_const.h\t1.2\t96/07/08 SMI"    ! (/tmp/acompAAApja4Fx:16)
        .ident  "@(#)int_fmtio.h\t1.2\t96/07/08 SMI"    ! (/tmp/acompAAApja4Fx:17)
        .ident  "@(#)types32.h\t1.4\t98/02/13 SMI"      ! (/tmp/acompAAApja4Fx:18)
        .ident  "@(#)select.h\t1.17\t01/08/15 SMI"      ! (/tmp/acompAAApja4Fx:19)
        .ident  "@(#)math.h\t2.11\t00/09/07 SMI"        ! (/tmp/acompAAApja4Fx:20)
        .ident  "@(#)math_iso.h\t1.2\t00/09/07 SMI"     ! (/tmp/acompAAApja4Fx:21)
        .ident  "@(#)floatingpoint.h\t2.5\t99/06/22 SMI"        ! (/tmp/acompAAApja4Fx:22)
        .ident  "@(#)stdio_tag.h\t1.3\t98/04/20 SMI"    ! (/tmp/acompAAApja4Fx:23)
        .ident  "@(#)ieeefp.h\t2.8 99/10/29"    ! (/tmp/acompAAApja4Fx:24)
        .ident  "acomp: Sun C 5.5 Patch 112760-07 2004/02/03"   ! (/tmp/acompAAApja4Fx:57)
        .ident  "iropt: Sun Compiler Common 7.1 Patch 112763-10 2004/01/27"     ! (/tmp/acompAAApja4Fx:58)
        .ident  "cg: Sun Compiler Common 7.1 Patch 112763-10 2004/01/27"        ! (NO SOURCE LINE)
! End Disassembling Ident

/*
 * FZERO: write zero into every even-numbered floating-point register
 * from %f0 through %f62.
 *
 * %f0 and %f2 are cleared directly with fzero. Each remaining register
 * is then written with either %f0 + %f2 (faddd) or %f0 * %f2 (fmuld);
 * both operands are zero, so every result is zero as well.
 * NOTE(review): the alternation of faddd and fmuld presumably keeps both
 * the FP add and FP multiply units busy; the intent is not stated in
 * this file, so confirm before relying on it.
 *
 * The macro takes no arguments and touches no integer registers. Its
 * body is one logical line of ';'-separated instructions, so a '!'
 * comment placed inside it would hide everything that follows.
 */
#define FZERO                           \
        fzero   %f0                     ;\
        fzero   %f2                     ;\
        faddd   %f0, %f2, %f4           ;\
        fmuld   %f0, %f2, %f6           ;\
        faddd   %f0, %f2, %f8           ;\
        fmuld   %f0, %f2, %f10          ;\
        faddd   %f0, %f2, %f12          ;\
        fmuld   %f0, %f2, %f14          ;\
        faddd   %f0, %f2, %f16          ;\
        fmuld   %f0, %f2, %f18          ;\
        faddd   %f0, %f2, %f20          ;\
        fmuld   %f0, %f2, %f22          ;\
        faddd   %f0, %f2, %f24          ;\
        fmuld   %f0, %f2, %f26          ;\
        faddd   %f0, %f2, %f28          ;\
        fmuld   %f0, %f2, %f30          ;\
        faddd   %f0, %f2, %f32          ;\
        fmuld   %f0, %f2, %f34          ;\
        faddd   %f0, %f2, %f36          ;\
        fmuld   %f0, %f2, %f38          ;\
        faddd   %f0, %f2, %f40          ;\
        fmuld   %f0, %f2, %f42          ;\
        faddd   %f0, %f2, %f44          ;\
        fmuld   %f0, %f2, %f46          ;\
        faddd   %f0, %f2, %f48          ;\
        fmuld   %f0, %f2, %f50          ;\
        faddd   %f0, %f2, %f52          ;\
        fmuld   %f0, %f2, %f54          ;\
        faddd   %f0, %f2, %f56          ;\
        fmuld   %f0, %f2, %f58          ;\
        faddd   %f0, %f2, %f60          ;\
        fmuld   %f0, %f2, %f62

#include "assym.h"

/*
 * In the routine below, we check (and, if necessary, set) the FPRS_FEF
 * bit because we don't want to take an fp_disabled trap. We need not
 * check or set the PSTATE_PEF bit, as that is done early during boot.
 */
        /*
         * big_savefp: record the current floating-point state in a save
         * area and leave the FPU enabled through FPRS_FEF.
         *
         * In:   %o0 = base address of the save area. %fprs is stored as
         *             a 32-bit word at offset FPU_FPRS and %fsr as a
         *             64-bit value at offset FPU_FSR; the same base is
         *             handed to BSTORE_FPREGS.
         *             NOTE(review): the offsets presumably come from
         *             assym.h and the area layout/alignment is dictated
         *             by BSTORE_FPREGS; confirm against those headers.
         * Uses: %o2 (copy of %fprs), %o4 (scratch given to
         *       BSTORE_FPREGS), integer condition codes.
         *
         * If FPRS_FEF was already set, the FP registers are stored with
         * BSTORE_FPREGS and then %fsr is stored. If it was clear, the FP
         * registers are NOT saved: the routine only writes FPRS_FEF into
         * %fprs and stores %fsr. Leaf routine, returns with retl.
         */
        ENTRY(big_savefp)
        rd      %fprs, %o2                      ! %o2 = current %fprs
        st      %o2, [%o0 + FPU_FPRS]           ! keep it for big_restorefp
        andcc   %o2, FPRS_FEF, %g0              ! is FPRS_FEF set?
        bnz,a,pt        %icc, .fregs_save       ! yes, go to save
        nop                                     ! delay slot, annulled if not taken
        wr      %g0, FPRS_FEF, %fprs            ! else, set the bit (other bits written as 0)
        stx     %fsr, [%o0 + FPU_FSR]   ! store %fsr
        retl
        nop                                     ! delay slot
.fregs_save:
        /* FP was in use: store the FP registers into the save area. */
        BSTORE_FPREGS(%o0, %o4)
        stx     %fsr, [%o0 + FPU_FSR]   ! store %fsr
        retl
        nop                                     ! delay slot
        SET_SIZE(big_savefp)


        /*
         * big_restorefp: counterpart of big_savefp; reinstates the
         * floating-point state from the save area at %o0.
         *
         * In:   %o0 = base address of the save area. %fsr is reloaded
         *             from offset FPU_FSR and the saved %fprs word is
         *             read from offset FPU_FPRS; the same base is handed
         *             to BLOAD_FPREGS.
         * Uses: %o1 (saved %fprs), %o2 (scratch given to BLOAD_FPREGS),
         *       integer condition codes.
         *
         * If the saved %fprs had FPRS_FEF set, the FP registers are
         * reloaded with BLOAD_FPREGS and %fprs is set to the saved
         * value. Otherwise the FP registers are cleared with FZERO so
         * that nothing is left behind in them, and %fprs is written as
         * zero. Leaf routine, returns with retl.
         *
         * NOTE(review): the initial %fsr load assumes FP is still
         * enabled, presumably because big_savefp left FPRS_FEF set;
         * confirm that every caller pairs the two routines.
         * NOTE(review): on the FZERO path the faddd/fmuld instructions
         * run after %fsr has been reloaded and may update its exception
         * fields; presumably harmless since the saved state had FP
         * disabled, but confirm.
         */
        ENTRY(big_restorefp)
        ldx     [%o0 + FPU_FSR], %fsr   ! restore %fsr
        ld      [%o0 + FPU_FPRS], %o1   ! %o1 = %fprs value saved earlier
        andcc   %o1, FPRS_FEF, %g0      ! is FPRS_FEF set in saved %fprs?
        bnz,pt  %icc, .fregs_restore    ! yes, go to restore
        nop                             ! delay slot
        FZERO                           ! zero out to avoid leaks
        wr      %g0, 0, %fprs           ! %fprs = 0
        retl
        nop                             ! delay slot
.fregs_restore:
        /* FP was in use when saved: reload the FP registers. */
        BLOAD_FPREGS(%o0, %o2)
        wr      %o1, 0, %fprs           ! %fprs = saved value
        retl
        nop                             ! delay slot
        SET_SIZE(big_restorefp)