/* root/sys/arch/powerpc/powerpc/vecast.S */
/*      $OpenBSD: vecast.S,v 1.1 2022/10/22 00:58:56 gkoehler Exp $     */

/*
 * Copyright (c) 2022 George Koehler <gkoehler@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <machine/asm.h>
#include <machine/psl.h>

/*
 * To load or store an arbitrary AltiVec register, we extract its
 * number from the instruction and multiply it by 8.  We do both using
 * rlwinm to rotate it left into bits 24 to 28.
 *
 * 0        10   15   20  24  28
 * |         |    |    |   |   |
 * 000100dddddaaaaabbbbbcccccxxxxxx
 */
#define VD_ROTATE       14, 24, 28
#define VA_ROTATE       19, 24, 28
#define VB_ROTATE       24, 24, 28
#define VC_ROTATE       29, 24, 28

/*
 * vctuxs, vctsxs have an unsigned immediate UI in bits 11 to 15.  We
 * extract it into bits 4 to 8, then add FLOAT_1_IS to make 2**UI.
 */
#define UI_ROTATE       7, 4, 8
#define FLOAT_1_IS      0x3f80          /* (float 1) >> 16 */

        .rodata
        .balign 4
.Lzero:         .float  0
.Lone:          .float  1
.Ln126:         .float  126
.Ltwo63:        .float  0x1p63
.Ltwo126:       .float  0x1p126
.Lmin:          .float  0x1p-126        /* FLT_MIN */   

        .text

/* This is the stack frame for vecast_asm. */
#define s_size          128
#define s_f31           120
#define s_f30           112
#define s_f29           104
#define s_f28           96
#define s_f27           88
#define s_f26           80
#define s_f25           72
#define s_f24           64
#define s_vc            48
#define s_vb            32
#define s_va            16

/*
 * vecast_asm(insn r3, label r4) emulates an AltiVec instruction when
 * it traps a denormal or subnormal float (with an AltiVec assist
 * exception).  Such a float f has 0 < |f| < FLT_MIN 2**-126.
 *
 * MPC7450 RISC Microprocessor Family Reference Manual, 7.1.2.5 Java
 * Mode, NaNs, Denormalized Numbers, and Zeros, has a list of trapping
 * instructions: vaddfp, vsubfp, vmaddfp, vnmsubfp, vrefp, vrsqrtefp,
 * vlogefp, vexptefp, vctsxs, vctuxs.
 */
ENTRY(vecast_asm)
        mflr    %r0                     /* r0 = return address */
        RETGUARD_SETUP_LATE(vecast_asm, %r9, %r0)
        stwu    %r1, -s_size(%r1)       /* Allocate our stack frame. */
        mfmsr   %r5                     /* r5 = old msr */

        /*
         * Borrow the vector and floating-point units.  We must
         * preserve all float and most vector registers.
         */
        rlwinm  %r6, %r5, 0, 17, 15     /* r6 = r5 & ~PSL_EE */
        oris    %r6, %r6, PSL_VEC >> 16
        ori     %r6, %r6, PSL_FP
        mtmsr   %r6
        isync

        /* Spill f24..f31; the labels below use them as scratch. */
        stfd    %f31, s_f31(%r1)
        stfd    %f30, s_f30(%r1)
        stfd    %f29, s_f29(%r1)
        stfd    %f28, s_f28(%r1)
        stfd    %f27, s_f27(%r1)
        stfd    %f26, s_f26(%r1)
        stfd    %f25, s_f25(%r1)
        stfd    %f24, s_f24(%r1)
        mffs    %f31                    /* f31 = old fpscr */

        /* r6 stays &.Lzero; labels load their literals relative to it. */
        lis     %r6, .Lzero@ha
        la      %r6, .Lzero@l(%r6)      /* r6 = address of .Lzero */

        /* fpscr = zero (round to nearest, no traps) */
        lfs     %f30, 0(%r6)            /* f30 = zero */
        mtfsf   255, %f30

        /* All instructions do s_vb = VB now; VD = s_va at finish. */
        rlwinm  %r7, %r3, VB_ROTATE     /* r7 = 8 * VB */
        la      %r8, s_vb(%r1)
        bl      vecast_store_vector

        /*
         * Dispatch; every label ends at vecast_finish.  f30 is still
         * zero here, for the labels that rely on it.
         */
        mtctr   %r4
        li      %r4, 4          /* r4 = 4 loop iterations */
        bctr                    /* Branch to our instruction's label. */

/*
 * vaddfp: d = a + b
 */
        .globl vecast_vaddfp
vecast_vaddfp:
        rlwinm  %r7, %r3, VA_ROTATE     /* r7 = 8 * VA */
        la      %r8, s_va(%r1)
        bl      vecast_store_vector     /* s_va = VA */

        /* s_va = s_va + s_vb, one float lane per iteration */
        mtctr   %r4
        la      %r7, (s_va - 4)(%r1)
1:      lfsu    %f30, 4(%r7)            /* r7 += 4, then load (r7). */
        lfs     %f29, (s_vb - s_va)(%r7)
        fadds   %f30, %f30, %f29
        stfs    %f30, 0(%r7)
        bdnz    1b                      /* Loop 4 times. */
        b       vecast_finish

/*
 * vsubfp: d = a - b
 */
        .globl vecast_vsubfp
vecast_vsubfp:
        rlwinm  %r7, %r3, VA_ROTATE     /* r7 = 8 * VA */
        la      %r8, s_va(%r1)
        bl      vecast_store_vector     /* s_va = VA */

        /* s_va = s_va - s_vb, one float lane per iteration */
        mtctr   %r4
        la      %r7, (s_va - 4)(%r1)
1:      lfsu    %f30, 4(%r7)            /* r7 += 4, then load (r7). */
        lfs     %f29, (s_vb - s_va)(%r7)
        fsubs   %f30, %f30, %f29
        stfs    %f30, 0(%r7)
        bdnz    1b                      /* Loop 4 times. */
        b       vecast_finish

/*
 * vmaddfp: d = a * c + b
 */
        .globl vecast_vmaddfp
vecast_vmaddfp:
        rlwinm  %r7, %r3, VA_ROTATE     /* r7 = 8 * VA */
        la      %r8, s_va(%r1)
        bl      vecast_store_vector     /* s_va = VA */
        rlwinm  %r7, %r3, VC_ROTATE     /* r7 = 8 * VC */
        la      %r8, s_vc(%r1)
        bl      vecast_store_vector     /* s_vc = VC */

        /* s_va = s_va * s_vc + s_vb, one float lane per iteration */
        mtctr   %r4
        la      %r7, (s_va - 4)(%r1)
1:      lfsu    %f30, 4(%r7)            /* r7 += 4, then load a = (r7). */
        lfs     %f29, (s_vb - s_va)(%r7)
        lfs     %f28, (s_vc - s_va)(%r7)
        fmadds  %f30, %f30, %f28, %f29  /* f30 = a * c + b */
        stfs    %f30, 0(%r7)
        bdnz    1b                      /* Loop 4 times. */
        b       vecast_finish

/*
 * vnmsubfp: d = b - a * c
 */
        .globl vecast_vnmsubfp
vecast_vnmsubfp:
        rlwinm  %r7, %r3, VA_ROTATE     /* r7 = 8 * VA */
        la      %r8, s_va(%r1)
        bl      vecast_store_vector     /* s_va = VA */
        rlwinm  %r7, %r3, VC_ROTATE     /* r7 = 8 * VC */
        la      %r8, s_vc(%r1)
        bl      vecast_store_vector     /* s_vc = VC */

        /* s_va = -(s_va * s_vc - s_vb), one float lane per iteration */
        mtctr   %r4
        la      %r7, (s_va - 4)(%r1)
1:      lfsu    %f30, 4(%r7)            /* r7 += 4, then load a = (r7). */
        lfs     %f29, (s_vb - s_va)(%r7)
        lfs     %f28, (s_vc - s_va)(%r7)
        fnmsubs %f30, %f30, %f28, %f29  /* f30 = -(a * c - b) = b - a * c */
        stfs    %f30, 0(%r7)
        bdnz    1b                      /* Loop 4 times. */
        b       vecast_finish

/*
 * vrefp: d = estimate 1 / b
 */
        .globl vecast_vrefp
vecast_vrefp:
        /* s_va = estimate 1 / s_vb, via scalar fres, one lane at a time */
        mtctr   %r4
        la      %r7, (s_vb - 4)(%r1)
1:      lfsu    %f30, 4(%r7)            /* r7 += 4, then load b = (r7). */
        fres    %f30, %f30
        stfs    %f30, (s_va - s_vb)(%r7)
        bdnz    1b                      /* Loop 4 times. */
        b       vecast_finish

/*
 * vrsqrtefp: d = estimate 1 / sqrt(b)
 * 1 / sqrt(b) = 1 / sqrt(b * 2**126) * 2**63 when b < 2**-126
 *
 * MPC7455's frsqrte does 1 / sqrt(1) = 0.984375, relative error 1/64.
 * AltiVec must not err over 1/4096, so avoid frsqrte.
 */
        .globl vecast_vrsqrtefp
vecast_vrsqrtefp:
        /* f30 = 1; f29 = 2**63, f28 = 2**126; f27 = 2**-126 */
        lfs     %f30, (.Lone - .Lzero)(%r6)
        lfs     %f29, (.Ltwo63 - .Lzero)(%r6)
        lfs     %f28, (.Ltwo126 - .Lzero)(%r6)
        lfs     %f27, (.Lmin - .Lzero)(%r6)

        /*
         * s_vb = s_vb * 2**126, s_va = 2**63 when b < 2**-126
         * s_va = 1 when b >= 2**-126
         */
        mtctr   %r4
        la      %r7, (s_vb - 4)(%r1)
1:      lfsu    %f26, 4(%r7)            /* r7 += 4, then load b = (r7). */
        fmuls   %f25, %f26, %f28        /* f25 = b * 2**126 */
        fsubs   %f24, %f26, %f27        /* f24 selects b >= 2**-126 */
        fsel    %f26, %f24, %f26, %f25  /* f26 = b or b * 2**126 */
        stfs    %f26, 0(%r7)
        fsel    %f25, %f24, %f30, %f29  /* f25 = 1 or 2**63 */
        stfs    %f25, (s_va - s_vb)(%r7)
        bdnz    1b                      /* Loop 4 times. */

        /* s_vb = estimate 1 / sqrt(s_vb), on the now-scaled operand */
        la      %r7, s_vc(%r1)
        la      %r8, s_vb(%r1)
        stvx    %v31, 0, %r7            /* Save v31 in s_vc. */
        lvx     %v31, 0, %r8
        vrsqrtefp %v31, %v31
        stvx    %v31, 0, %r8
        lvx     %v31, 0, %r7            /* Restore v31. */

        /* s_va = s_vb * s_va (undo the scaling: multiply by 1 or 2**63) */
        mtctr   %r4
        la      %r7, (s_va - 4)(%r1)
1:      lfsu    %f30, 4(%r7)
        lfs     %f29, (s_vb - s_va)(%r7)
        fmuls   %f30, %f29, %f30
        stfs    %f30, 0(%r7)
        bdnz    1b                      /* Loop 4 times. */
        b       vecast_finish

/*
 * vlogefp: d = estimate log2(b)
 * log2(b) = log2(b * 2**126) - 126 when b < 2**-126
 */
        .globl  vecast_vlogefp
vecast_vlogefp:
        /* f30 = 0; f29 = 126; f28 = 2**126; f27 = 2**-126 */
        /* (f30 is still 0 from vecast_asm; no load needed.) */
        lfs     %f29, (.Ln126 - .Lzero)(%r6)
        lfs     %f28, (.Ltwo126 - .Lzero)(%r6)
        lfs     %f27, (.Lmin - .Lzero)(%r6)

        /*
         * s_vb = s_vb * 2**126, s_va = 126 when s_vb < 2**-126
         * s_va = 0 when s_vb >= 2**-126
         */
        mtctr   %r4
        la      %r7, (s_vb - 4)(%r1)
1:      lfsu    %f26, 4(%r7)            /* r7 += 4, then load b = (r7). */
        fmuls   %f25, %f26, %f28        /* f25 = b * 2**126 */
        fsubs   %f24, %f26, %f27        /* f24 selects b >= 2**-126 */
        fsel    %f26, %f24, %f26, %f25  /* f26 = b or b * 2**126 */
        stfs    %f26, 0(%r7)
        fsel    %f25, %f24, %f30, %f29  /* f25 = 0 or 126 */
        stfs    %f25, (s_va - s_vb)(%r7)
        bdnz    1b                      /* Loop 4 times. */

        /* s_vb = estimate log2(s_vb), on the now-scaled operand */
        la      %r7, s_vc(%r1)
        la      %r8, s_vb(%r1)
        stvx    %v31, 0, %r7            /* Save v31 in s_vc. */
        lvx     %v31, 0, %r8
        vlogefp %v31, %v31
        stvx    %v31, 0, %r8
        lvx     %v31, 0, %r7            /* Restore v31. */

        /* s_va = s_vb - s_va (undo the scaling: subtract 0 or 126) */
        mtctr   %r4
        la      %r7, (s_va - 4)(%r1)
1:      lfsu    %f30, 4(%r7)
        lfs     %f29, (s_vb - s_va)(%r7)
        fsubs   %f30, %f29, %f30
        stfs    %f30, 0(%r7)
        bdnz    1b                      /* Loop 4 times. */
        b       vecast_finish

/*
 * vexptefp: d = estimate 2**b
 * 2**b = 2**(b + 126) * 2**-126 when -252 <= b < -126
 */
        .globl  vecast_vexptefp
vecast_vexptefp:
        /* f30 = 1; f29 = 126; f28 = 2**-126 */
        lfs     %f30, (.Lone - .Lzero)(%r6)
        lfs     %f29, (.Ln126 - .Lzero)(%r6)
        lfs     %f28, (.Lmin - .Lzero)(%r6)

        /*
         * s_vb = s_vb + 126 when -252 <= b < -126
         * s_va = 2**-126 when b < -126
         * s_va = 1 when b >= -126
         *
         * If b < -252, we avoid a possibly subnormal 2**(b + 126)
         * by calculating 2**b * 2**-126 = 0 * 2**-126 = 0.
         */
        mtctr   %r4
        la      %r7, (s_vb - 4)(%r1)
1:      lfsu    %f27, 4(%r7)            /* r7 += 4, then load b = (r7). */
        fadds   %f26, %f27, %f29        /* f26 selects b >= -126 */
        fadds   %f25, %f26, %f29        /* f25 selects b >= -252 */
        fsel    %f24, %f26, %f27, %f26  /* f24 = b if b >= -126, else b + 126 */
        fsel    %f24, %f25, %f24, %f27  /* f24 = b or b + 126 */
        stfs    %f24, 0(%r7)
        fsel    %f27, %f26, %f30, %f28  /* f27 = 1 or 2**-126 */
        stfs    %f27, (s_va - s_vb)(%r7)
        bdnz    1b                      /* Loop 4 times. */

        /* s_vb = estimate 2**s_vb, on the now-shifted operand */
        la      %r7, s_vc(%r1)
        la      %r8, s_vb(%r1)
        stvx    %v31, 0, %r7            /* Save v31 in s_vc. */
        lvx     %v31, 0, %r8
        vexptefp %v31, %v31
        stvx    %v31, 0, %r8
        lvx     %v31, 0, %r7            /* Restore v31. */

        /* s_va = s_vb * s_va (undo the shift: multiply by 1 or 2**-126) */
        mtctr   %r4
        la      %r7, (s_va - 4)(%r1)
1:      lfsu    %f30, 4(%r7)
        lfs     %f29, (s_vb - s_va)(%r7)
        fmuls   %f30, %f29, %f30
        stfs    %f30, 0(%r7)
        bdnz    1b                      /* Loop 4 times. */
        b       vecast_finish

/*
 * vctsxs: d = (int32_t)(b * 2**u) where 0 <= u < 32
 * d = 0 when |b| < 2**-126
 */
        .globl  vecast_vctsxs
vecast_vctsxs:
        /* f30 = 0; f29 = 2**-126; f28 = 2**u */
        /* (f30 is still 0 from vecast_asm; no load needed.) */
        lfs     %f29, (.Lmin - .Lzero)(%r6)
        rlwinm  %r7, %r3, UI_ROTATE     /* r7 = UI << 23 */
        addis   %r7, %r7, FLOAT_1_IS    /* r7 = bit pattern of (float)2**UI */
        stw     %r7, s_va(%r1)
        lfs     %f28, s_va(%r1)         /* f28 = 2**u, via the stack */

        /* s_va = s_vb * 2**u, unless b is tiny. */
        mtctr   %r4
        la      %r7, (s_vb - 4)(%r1)
1:      lfsu    %f27, 4(%r7)            /* r7 += 4, then load b = (r7). */
        fmuls   %f26, %f27, %f28
        fabs    %f27, %f27
        fsubs   %f27, %f27, %f29        /* f27 selects |b| >= 2**-126 */
        fsel    %f26, %f27, %f26, %f30  /* f26 = b * 2**u or 0 */
        stfs    %f26, (s_va - s_vb)(%r7)
        bdnz    1b                      /* Loop 4 times. */

        /* s_va = (int32_t)s_va, via AltiVec vctsxs with u = 0 */
        la      %r7, s_vc(%r1)
        la      %r8, s_va(%r1)
        stvx    %v31, 0, %r7            /* Save v31 in s_vc. */
        lvx     %v31, 0, %r8
        vctsxs  %v31, %v31, 0           /* May set SAT in vscr. */
        stvx    %v31, 0, %r8
        lvx     %v31, 0, %r7            /* Restore v31. */
        b       vecast_finish

/*
 * vctuxs: d = (uint32_t)(b * 2**u) where 0 <= u < 32
 * d = 0 when |b| < 2**-126
 */
        .globl  vecast_vctuxs
vecast_vctuxs:
        /* f30 = 0; f29 = 2**-126; f28 = 2**u */
        /* (f30 is still 0 from vecast_asm; no load needed.) */
        lfs     %f29, (.Lmin - .Lzero)(%r6)
        rlwinm  %r7, %r3, UI_ROTATE     /* r7 = UI << 23 */
        addis   %r7, %r7, FLOAT_1_IS    /* r7 = bit pattern of (float)2**UI */
        stw     %r7, s_va(%r1)
        lfs     %f28, s_va(%r1)         /* f28 = 2**u, via the stack */

        /* s_va = s_vb * 2**u, unless b is tiny. */
        mtctr   %r4
        la      %r7, (s_vb - 4)(%r1)
1:      lfsu    %f27, 4(%r7)            /* r7 += 4, then load b = (r7). */
        fmuls   %f26, %f27, %f28
        fabs    %f27, %f27
        fsubs   %f27, %f27, %f29        /* f27 selects |b| >= 2**-126 */
        fsel    %f26, %f27, %f26, %f30  /* f26 = b * 2**u or 0 */
        stfs    %f26, (s_va - s_vb)(%r7)
        bdnz    1b                      /* Loop 4 times. */

        /* s_va = (uint32_t)s_va, via AltiVec vctuxs with u = 0 */
        la      %r7, s_vc(%r1)
        la      %r8, s_va(%r1)
        stvx    %v31, 0, %r7            /* Save v31 in s_vc. */
        lvx     %v31, 0, %r8
        vctuxs  %v31, %v31, 0           /* May set SAT in vscr. */
        stvx    %v31, 0, %r8
        lvx     %v31, 0, %r7            /* Restore v31. */
        /* b    vecast_finish */

vecast_finish:
        /* VD = s_va */
        rlwinm  %r7, %r3, VD_ROTATE     /* r7 = 8 * VD */
        addis   %r7, %r7, 1f@ha
        addi    %r7, %r7, 1f@l          /* r7 = 1f + 8 * VD */
        mtctr   %r7
        la      %r8, s_va(%r1)
        bctr
/* Each M(n) expands to 8 bytes of code, so 1f + 8 * VD loads vVD. */
#define M(n) lvx %v##n, 0, %r8; b 2f
1:      M( 0); M( 1); M( 2); M( 3); M( 4); M( 5); M( 6); M( 7)
        M( 8); M( 9); M(10); M(11); M(12); M(13); M(14); M(15)
        M(16); M(17); M(18); M(19); M(20); M(21); M(22); M(23)
        M(24); M(25); M(26); M(27); M(28); M(29); M(30); M(31)
#undef M
2:      mtlr    %r0                     /* Reload return address. */
        mtfsf   255, %f31               /* Restore old fpscr. */
        lfd     %f24, s_f24(%r1)
        lfd     %f25, s_f25(%r1)
        lfd     %f26, s_f26(%r1)
        lfd     %f27, s_f27(%r1)
        lfd     %f28, s_f28(%r1)
        lfd     %f29, s_f29(%r1)
        lfd     %f30, s_f30(%r1)
        lfd     %f31, s_f31(%r1)
        mtmsr   %r5                     /* Restore old msr. */
        isync
        addi    %r1, %r1, s_size        /* Pop our stack frame. */
        RETGUARD_CHECK(vecast_asm, %r9, %r0)
        blr

/*
 * Stores vector v(r7 / 8) to address r8.
 * Clobbers r7 and ctr; r11 and r12 are RETGUARD scratch.
 */
vecast_store_vector:
        RETGUARD_SETUP(vecast_store_vector, %r11, %r12)
        addis   %r7, %r7, 1f@ha
        addi    %r7, %r7, 1f@l          /* r7 = 1f + 8 * (vector number) */
        mtctr   %r7
        bctr
/* Each M(n) expands to 8 bytes of code, indexing the table by 8 * n. */
#define M(n)    stvx    %v##n, 0, %r8; b 2f
1:      M( 0); M( 1); M( 2); M( 3); M( 4); M( 5); M( 6); M( 7)
        M( 8); M( 9); M(10); M(11); M(12); M(13); M(14); M(15)
        M(16); M(17); M(18); M(19); M(20); M(21); M(22); M(23)
        M(24); M(25); M(26); M(27); M(28); M(29); M(30); M(31)
#undef M
2:      RETGUARD_CHECK(vecast_store_vector, %r11, %r12)
        blr