root/usr/src/uts/sun4/os/visinstr.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* VIS floating point instruction simulator for Sparc FPU simulator. */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/fpu/fpusystm.h>
#include <sys/fpu/fpu_simulator.h>
#include <sys/vis_simulator.h>
#include <sys/fpu/globals.h>
#include <sys/privregs.h>
#include <sys/sun4asi.h>
#include <sys/machasi.h>
#include <sys/debug.h>
#include <sys/cpu_module.h>
#include <sys/systm.h>

#define FPU_REG_FIELD uint32_reg        /* Coordinate with FPU_REGS_TYPE. */
#define FPU_DREG_FIELD uint64_reg       /* Coordinate with FPU_DREGS_TYPE. */
#define FPU_FSR_FIELD uint64_reg        /* Coordinate with V9_FPU_FSR_TYPE. */

/*
 * Declared here and presumably defined elsewhere in the kernel (TODO
 * confirm location).  vis_edge() calls it with its two source operands
 * to obtain the SUBcc condition codes it stores into TSTATE.CCR.
 */
extern  uint_t  get_subcc_ccr(uint64_t, uint64_t);

/*
 * Forward declarations of the per-instruction-group simulators that are
 * private to this file.  vis_fpu_simulator() dispatches to most of them
 * based on the opf field; vis_prtl_fst(), vis_short_fls() and
 * vis_blk_fldst() are not called from that dispatcher.
 */
static enum ftt_type vis_array(fp_simd_type *, vis_inst_type, struct regs *,
                                void *);
static enum ftt_type vis_alignaddr(fp_simd_type *, vis_inst_type,
                                struct regs *, void *, kfpu_t *);
static enum ftt_type vis_edge(fp_simd_type *, vis_inst_type, struct regs *,
                                void *);
static enum ftt_type vis_faligndata(fp_simd_type *, fp_inst_type,
                                kfpu_t *);
static enum ftt_type vis_bmask(fp_simd_type *, vis_inst_type, struct regs *,
                                void *, kfpu_t *);
static enum ftt_type vis_bshuffle(fp_simd_type *, fp_inst_type,
                                kfpu_t *);
static enum ftt_type vis_siam(fp_simd_type *, vis_inst_type, kfpu_t *);
static enum ftt_type vis_fcmp(fp_simd_type *, vis_inst_type, struct regs *,
                                void *);
static enum ftt_type vis_fmul(fp_simd_type *, vis_inst_type);
static enum ftt_type vis_fpixel(fp_simd_type *, vis_inst_type, kfpu_t *);
static enum ftt_type vis_fpaddsub(fp_simd_type *, vis_inst_type);
static enum ftt_type vis_pdist(fp_simd_type *, fp_inst_type, struct regs *,
                                void *, uint_t);
static enum ftt_type vis_prtl_fst(fp_simd_type *, vis_inst_type, struct regs *,
                                void *, uint_t);
static enum ftt_type vis_short_fls(fp_simd_type *, vis_inst_type,
                                struct regs *, void *, uint_t);
static enum ftt_type vis_blk_fldst(fp_simd_type *, vis_inst_type,
                                struct regs *, void *, uint_t);

/*
 * Simulator for VIS instructions with op3 == 0x36 that get fp_disabled
 * traps.
 *
 * The instruction is decoded on its opf field.  Instruction groups with
 * their own helper (edge, array, alignaddr, bmask, fcmp, fmul, pixel
 * formatting, pdist, faligndata, bshuffle, fpadd/fpsub, siam) are handed
 * off; the bitwise logical operations (fzero ... fone and their
 * single-word "s" forms) are simulated inline here.
 *
 * Return value:
 *   ftt_unimplemented  opf is not one of the cases below; r_pc/r_npc are
 *                      left untouched.
 *   otherwise          the value produced by the helper (ftt_none for the
 *                      inline logical ops).  r_pc/r_npc are stepped past
 *                      the instruction in this case, whatever the helper
 *                      returned.
 */
enum ftt_type
vis_fpu_simulator(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        fp_inst_type    pinst,  /* FPU instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw,   /* Pointer to locals and ins. */
        kfpu_t          *fp)    /* Need to fp to access gsr reg */
{
        uint_t  nrs1, nrs2, nrd;        /* Register number fields. */
        uint_t  us1, us2, usr;          /* 32-bit operands and result. */
        uint64_t lus1, lus2, lusr;      /* 64-bit operands and result. */
        enum ftt_type ftt = ftt_none;
        /*
         * The same instruction viewed through two layouts: the generic FP
         * one passed in by the caller and the VIS one, which exposes opf.
         */
        union {
                vis_inst_type   inst;
                fp_inst_type    pinst;
        } f;

        ASSERT(USERMODE(pregs->r_tstate));
        nrs1 = pinst.rs1;
        nrs2 = pinst.rs2;
        nrd = pinst.rd;
        f.pinst = pinst;
        /*
         * For even opf values the register fields are treated as
         * double-precision encodings: an odd 5-bit number n is remapped
         * to (n & 0x1e) | 0x20, i.e. an even number in the range 32..62.
         * Only the inline logical ops below use these adjusted numbers;
         * the helpers decode the instruction fields themselves.
         */
        if ((f.inst.opf & 1) == 0) {            /* double precision */
                if ((nrs1 & 1) == 1)            /* fix register encoding */
                        nrs1 = (nrs1 & 0x1e) | 0x20;
                if ((nrs2 & 1) == 1)
                        nrs2 = (nrs2 & 0x1e) | 0x20;
                if ((nrd & 1) == 1)
                        nrd = (nrd & 0x1e) | 0x20;
        }

        switch (f.inst.opf) {
                /* these instr's do not use fp regs */
        case edge8:
        case edge8l:
        case edge8n:
        case edge8ln:
        case edge16:
        case edge16l:
        case edge16n:
        case edge16ln:
        case edge32:
        case edge32l:
        case edge32n:
        case edge32ln:
                ftt = vis_edge(pfpsd, f.inst, pregs, prw);
                break;
        case array8:
        case array16:
        case array32:
                ftt = vis_array(pfpsd, f.inst, pregs, prw);
                break;
        case alignaddr:
        case alignaddrl:
                ftt = vis_alignaddr(pfpsd, f.inst, pregs, prw, fp);
                break;
        case bmask:
                ftt = vis_bmask(pfpsd, f.inst, pregs, prw, fp);
                break;
        case fcmple16:
        case fcmpne16:
        case fcmpgt16:
        case fcmpeq16:
        case fcmple32:
        case fcmpne32:
        case fcmpgt32:
        case fcmpeq32:
                ftt = vis_fcmp(pfpsd, f.inst, pregs, prw);
                break;
        case fmul8x16:
        case fmul8x16au:
        case fmul8x16al:
        case fmul8sux16:
        case fmul8ulx16:
        case fmuld8sux16:
        case fmuld8ulx16:
                ftt = vis_fmul(pfpsd, f.inst);
                break;
        case fpack16:
        case fpack32:
        case fpackfix:
        case fexpand:
        case fpmerge:
                ftt = vis_fpixel(pfpsd, f.inst, fp);
                break;
        case pdist:
        case pdistn:
                ftt = vis_pdist(pfpsd, pinst, pregs, prw, f.inst.opf);
                break;
        case faligndata:
                ftt = vis_faligndata(pfpsd, pinst, fp);
                break;
        case bshuffle:
                ftt = vis_bshuffle(pfpsd, pinst, fp);
                break;
        case fpadd16:
        case fpadd16s:
        case fpadd32:
        case fpadd32s:
        case fpsub16:
        case fpsub16s:
        case fpsub32:
        case fpsub32s:
                ftt = vis_fpaddsub(pfpsd, f.inst);
                break;
        /*
         * Bitwise logical operations, simulated inline.  Each case
         * unpacks its source register(s), combines them and packs the
         * result into rd.  The plain form works on a 64-bit extended
         * word; the "s" form works on a 32-bit word.  ftt stays ftt_none.
         */
        case fzero:
                lusr = 0;
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fzeros:
                usr = 0;
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fnor:
                _fp_unpack_extword(pfpsd, &lus1, nrs1);
                _fp_unpack_extword(pfpsd, &lus2, nrs2);
                lusr = ~(lus1 | lus2);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fnors:
                _fp_unpack_word(pfpsd, &us1, nrs1);
                _fp_unpack_word(pfpsd, &us2, nrs2);
                usr = ~(us1 | us2);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fandnot2:
                _fp_unpack_extword(pfpsd, &lus1, nrs1);
                _fp_unpack_extword(pfpsd, &lus2, nrs2);
                lusr = (lus1 & ~lus2);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fandnot2s:
                _fp_unpack_word(pfpsd, &us1, nrs1);
                _fp_unpack_word(pfpsd, &us2, nrs2);
                usr = (us1 & ~us2);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fnot2:
                _fp_unpack_extword(pfpsd, &lus2, nrs2);
                lusr = ~lus2;
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fnot2s:
                _fp_unpack_word(pfpsd, &us2, nrs2);
                usr = ~us2;
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fandnot1:
                _fp_unpack_extword(pfpsd, &lus1, nrs1);
                _fp_unpack_extword(pfpsd, &lus2, nrs2);
                lusr = (~lus1 & lus2);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fandnot1s:
                _fp_unpack_word(pfpsd, &us1, nrs1);
                _fp_unpack_word(pfpsd, &us2, nrs2);
                usr = (~us1 & us2);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fnot1:
                _fp_unpack_extword(pfpsd, &lus1, nrs1);
                lusr = ~lus1;
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fnot1s:
                _fp_unpack_word(pfpsd, &us1, nrs1);
                usr = ~us1;
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fxor:
                _fp_unpack_extword(pfpsd, &lus1, nrs1);
                _fp_unpack_extword(pfpsd, &lus2, nrs2);
                lusr = (lus1 ^ lus2);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fxors:
                _fp_unpack_word(pfpsd, &us1, nrs1);
                _fp_unpack_word(pfpsd, &us2, nrs2);
                usr = (us1 ^ us2);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fnand:
                _fp_unpack_extword(pfpsd, &lus1, nrs1);
                _fp_unpack_extword(pfpsd, &lus2, nrs2);
                lusr = ~(lus1 & lus2);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fnands:
                _fp_unpack_word(pfpsd, &us1, nrs1);
                _fp_unpack_word(pfpsd, &us2, nrs2);
                usr = ~(us1 & us2);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fand:
                _fp_unpack_extword(pfpsd, &lus1, nrs1);
                _fp_unpack_extword(pfpsd, &lus2, nrs2);
                lusr = (lus1 & lus2);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fands:
                _fp_unpack_word(pfpsd, &us1, nrs1);
                _fp_unpack_word(pfpsd, &us2, nrs2);
                usr = (us1 & us2);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fxnor:
                _fp_unpack_extword(pfpsd, &lus1, nrs1);
                _fp_unpack_extword(pfpsd, &lus2, nrs2);
                lusr = ~(lus1 ^ lus2);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fxnors:
                _fp_unpack_word(pfpsd, &us1, nrs1);
                _fp_unpack_word(pfpsd, &us2, nrs2);
                usr = ~(us1 ^ us2);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fsrc1:
                _fp_unpack_extword(pfpsd, &lusr, nrs1);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fsrc1s:
                _fp_unpack_word(pfpsd, &usr, nrs1);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fornot2:
                _fp_unpack_extword(pfpsd, &lus1, nrs1);
                _fp_unpack_extword(pfpsd, &lus2, nrs2);
                lusr = (lus1 | ~lus2);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fornot2s:
                _fp_unpack_word(pfpsd, &us1, nrs1);
                _fp_unpack_word(pfpsd, &us2, nrs2);
                usr = (us1 | ~us2);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fsrc2:
                _fp_unpack_extword(pfpsd, &lusr, nrs2);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fsrc2s:
                _fp_unpack_word(pfpsd, &usr, nrs2);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fornot1:
                _fp_unpack_extword(pfpsd, &lus1, nrs1);
                _fp_unpack_extword(pfpsd, &lus2, nrs2);
                lusr = (~lus1 | lus2);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fornot1s:
                _fp_unpack_word(pfpsd, &us1, nrs1);
                _fp_unpack_word(pfpsd, &us2, nrs2);
                usr = (~us1 | us2);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case for_op:
                _fp_unpack_extword(pfpsd, &lus1, nrs1);
                _fp_unpack_extword(pfpsd, &lus2, nrs2);
                lusr = (lus1 | lus2);
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fors_op:
                _fp_unpack_word(pfpsd, &us1, nrs1);
                _fp_unpack_word(pfpsd, &us2, nrs2);
                usr = (us1 | us2);
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case fone:
                lusr = 0xffffffffffffffff;
                _fp_pack_extword(pfpsd, &lusr, nrd);
                break;
        case fones:
                usr = 0xffffffffUL;
                _fp_pack_word(pfpsd, &usr, nrd);
                break;
        case siam:
                ftt = vis_siam(pfpsd, f.inst, fp);
                break;
        default:
                /* Unknown opf: report it without touching pc/npc. */
                return (ftt_unimplemented);
        }

        /*
         * Every recognised opf ends up here, including the case where a
         * helper reported a fault in ftt.
         */
        pregs->r_pc = pregs->r_npc;     /* Do not retry emulated instruction. */
        pregs->r_npc += 4;
        return (ftt);
}

/*
 * Simulator for edge instructions (edge8/16/32 and their "l", "n" and
 * "ln" forms).
 *
 * rs1 and rs2 are read as two addresses.  The value written to rd is a
 * mask with one bit per lane of an 8-byte block: 8 lanes for edge8, 4 for
 * edge16, 2 for edge32.  A lane index is the low three address bits
 * divided by the lane size.
 *
 *   - Plain forms map lane 0 to the most significant mask bit, the "l"
 *     forms map lane 0 to the least significant bit.
 *   - Lanes before rs1's lane are cleared.
 *   - If both addresses lie in the same 8-byte block, lanes after rs2's
 *     lane are cleared as well.  Only the low 32 address bits take part
 *     in that comparison when TSTATE_AM is set in r_tstate.
 *
 * Forms without the "n" suffix additionally replace the CCR field of
 * r_tstate with the codes returned by get_subcc_ccr(rs1, rs2).  This is
 * done after the write to rd and regardless of its outcome.
 *
 * Returns the fault from a failed source-register read right away
 * (nothing is written in that case); otherwise returns the result of
 * write_iureg() for rd.
 */
static enum ftt_type
vis_edge(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst,   /* FPU instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw)   /* Pointer to locals and ins. */

{
        uint_t  nrs1, nrs2, nrd;        /* Register number fields. */
        enum ftt_type ftt;
        uint64_t addrl, addrr;
        uint64_t ah61l, ah61r;          /* Higher 61 bits of address */
        /*
         * Start from a defined value so that rd can never receive an
         * indeterminate mask, even if this routine were reached with an
         * opf that is not one of the edge opcodes.
         */
        uint64_t mask = 0;
        uint64_t full;                  /* Mask with every lane bit set */
        int     shift = -1;             /* log2(lane size in bytes) */
        int     little = 0;             /* "l" form: lane 0 is bit 0 */
        int     setcc = 0;              /* Non-"n" form: update CCR */
        int     top;                    /* Index of the last lane */
        int     al3l, al3r;             /* Lane index of each address */
        uint_t  ccr;

        nrs1 = inst.rs1;
        nrs2 = inst.rs2;
        nrd = inst.rd;

        ftt = read_iureg(pfpsd, nrs1, pregs, prw, &addrl);
        if (ftt != ftt_none)
                return (ftt);
        ftt = read_iureg(pfpsd, nrs2, pregs, prw, &addrr);
        if (ftt != ftt_none)
                return (ftt);

        /* Test PSTATE.AM to determine 32-bit vs 64-bit addressing */
        if ((pregs->r_tstate & TSTATE_AM) != 0) {
                ah61l = addrl & 0xfffffff8;
                ah61r = addrr & 0xfffffff8;
        } else {
                ah61l = addrl & ~(uint64_t)0x7;
                ah61r = addrr & ~(uint64_t)0x7;
        }

        /* Classify the opcode and account for it. */
        switch (inst.opf) {
        case edge8:
                VISINFO_KSTAT(vis_edge8);
                shift = 0;
                setcc = 1;
                break;
        case edge8n:
                VISINFO_KSTAT(vis_edge8n);
                shift = 0;
                break;
        case edge8l:
                VISINFO_KSTAT(vis_edge8l);
                shift = 0;
                little = 1;
                setcc = 1;
                break;
        case edge8ln:
                VISINFO_KSTAT(vis_edge8ln);
                shift = 0;
                little = 1;
                break;
        case edge16:
                VISINFO_KSTAT(vis_edge16);
                shift = 1;
                setcc = 1;
                break;
        case edge16n:
                VISINFO_KSTAT(vis_edge16n);
                shift = 1;
                break;
        case edge16l:
                VISINFO_KSTAT(vis_edge16l);
                shift = 1;
                little = 1;
                setcc = 1;
                break;
        case edge16ln:
                VISINFO_KSTAT(vis_edge16ln);
                shift = 1;
                little = 1;
                break;
        case edge32:
                VISINFO_KSTAT(vis_edge32);
                shift = 2;
                setcc = 1;
                break;
        case edge32n:
                VISINFO_KSTAT(vis_edge32n);
                shift = 2;
                break;
        case edge32l:
                VISINFO_KSTAT(vis_edge32l);
                shift = 2;
                little = 1;
                setcc = 1;
                break;
        case edge32ln:
                VISINFO_KSTAT(vis_edge32ln);
                shift = 2;
                little = 1;
                break;
        default:
                /* Not an edge opcode: leave mask at 0, CCR untouched. */
                break;
        }

        if (shift >= 0) {
                top = (8 >> shift) - 1;                 /* 7, 3 or 1 */
                full = ((uint64_t)1 << (top + 1)) - 1;  /* 0xff, 0xf, 0x3 */
                al3l = (int)((addrl & 0x7) >> shift);
                al3r = (int)((addrr & 0x7) >> shift);

                if (little) {
                        /* Lane k is bit k. */
                        mask = (full << al3l) & full;
                        if (ah61l == ah61r)
                                mask &= full >> (top - al3r);
                } else {
                        /* Lane k is bit (top - k). */
                        mask = full >> al3l;
                        if (ah61l == ah61r)
                                mask &= (full << (top - al3r)) & full;
                }
        }

        ftt = write_iureg(pfpsd, nrd, pregs, prw, &mask);

        if (setcc) {
                /* Update flags per SUBcc outcome */
                pregs->r_tstate &= ~((uint64_t)TSTATE_CCR_MASK
                                        << TSTATE_CCR_SHIFT);
                ccr = get_subcc_ccr(addrl, addrr);  /* get subcc cond. codes */
                pregs->r_tstate |= ((uint64_t)ccr << TSTATE_CCR_SHIFT);
        }
        return (ftt);
}

/*
 * Simulator for three-dimensional array addressing instructions
 * (array8, array16, array32).
 *
 * rs1 holds three packed coordinates whose integer parts start at bit 11
 * (X), bit 33 (Y) and bit 55 (Z).  rs2 holds n, the number of extra upper
 * integer bits to take from X and Y; values above 5 are treated as 5.
 *
 * The integer bits are interleaved into the result, low bits first:
 *
 *      X<1:0>  Y<1:0>  Z<0>  X<5:2>  Y<5:2>  Z<4:1>
 *      X<6+n-1:6>  Y<6+n-1:6>  Z<8:5>
 *
 * array16 then shifts the result left by one bit and array32 by two.
 * The final value is written to rd.
 *
 * Returns the fault from a failed source-register read, otherwise the
 * result of write_iureg() for rd.
 */
static enum ftt_type
vis_array(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst,   /* FPU instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw)   /* Pointer to locals and ins. */

{
        uint_t  src1 = inst.rs1;
        uint_t  src2 = inst.rs2;
        uint_t  dst = inst.rd;
        enum ftt_type ftt;
        uint64_t packed, size, addr;
        uint64_t xint, yint, zint;      /* Integer part of each coordinate */
        uint64_t upper;                 /* Mask for the n upper X/Y bits */

        ftt = read_iureg(pfpsd, src1, pregs, prw, &packed);
        if (ftt == ftt_none)
                ftt = read_iureg(pfpsd, src2, pregs, prw, &size);
        if (ftt != ftt_none)
                return (ftt);

        if (size > 5)
                size = 5;
        upper = ((uint64_t)1 << size) - 1;

        xint = packed >> 11;
        yint = packed >> 33;
        zint = packed >> 55;

        addr = xint & 0x3;                              /* X<1:0> */
        addr |= (yint & 0x3) << 2;                      /* Y<1:0> */
        addr |= (zint & 0x1) << 4;                      /* Z<0> */
        addr |= ((xint >> 2) & 0xf) << 5;               /* X<5:2> */
        addr |= ((yint >> 2) & 0xf) << 9;               /* Y<5:2> */
        addr |= ((zint >> 1) & 0xf) << 13;              /* Z<4:1> */
        addr |= ((xint >> 6) & upper) << 17;            /* X<6+n-1:6> */
        addr |= ((yint >> 6) & upper) << (17 + size);   /* Y<6+n-1:6> */
        addr |= ((zint >> 5) & 0xf) << (17 + 2 * size); /* Z<8:5> */

        /* Scale by the element size and account for the opcode. */
        if (inst.opf == array8) {
                VISINFO_KSTAT(vis_array8);
        } else if (inst.opf == array16) {
                VISINFO_KSTAT(vis_array16);
                addr <<= 1;
        } else if (inst.opf == array32) {
                VISINFO_KSTAT(vis_array32);
                addr <<= 2;
        }

        return (write_iureg(pfpsd, dst, pregs, prw, &addr));
}

/*
 * Simulator for alignaddr and alignaddrl instructions.
 *
 * Adds rs1 and rs2.  The sum with its low three bits cleared is written
 * to rd.  The low three bits themselves go into the GSR align field:
 * unchanged for alignaddr, negated modulo 8 for alignaddrl.
 *
 * The GSR is updated even if the write to rd reported a fault.
 *
 * Returns the fault from a failed source-register read (GSR untouched),
 * otherwise the result of write_iureg() for rd.
 */
static enum ftt_type
vis_alignaddr(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst,   /* FPU instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw,   /* Pointer to locals and ins. */
        kfpu_t          *fp)    /* Need to fp to access gsr reg */
{
        uint_t  src1 = inst.rs1;
        uint_t  src2 = inst.rs2;
        uint_t  dst = inst.rd;
        enum ftt_type ftt;
        uint64_t sum, addend, aligned, offset, gsr;

        ftt = read_iureg(pfpsd, src1, pregs, prw, &sum);
        if (ftt == ftt_none)
                ftt = read_iureg(pfpsd, src2, pregs, prw, &addend);
        if (ftt != ftt_none)
                return (ftt);
        sum += addend;

        /* rd gets the 8-byte aligned address. */
        aligned = sum & ~(uint64_t)0x7;
        ftt = write_iureg(pfpsd, dst, pregs, prw, &aligned);

        /* alignaddrl stores the two's complement of the byte offset. */
        offset = sum & 0x7;
        if (inst.opf == alignaddrl)
                offset = (0 - offset) & 0x7;

        gsr = pfpsd->fp_current_read_gsr(fp);
        gsr &= ~(GSR_ALIGN_MASK);       /* drop the old align offset */
        gsr |= (offset << GSR_ALIGN_SHIFT) & GSR_ALIGN_MASK;
        pfpsd->fp_current_write_gsr(gsr, fp);

        return (ftt);
}

/*
 * Simulator for bmask instruction.
 *
 * Adds rs1 and rs2, writes the sum to rd and copies the sum, shifted by
 * GSR_MASK_SHIFT and limited to GSR_MASK_MASK, into the GSR mask field.
 * The GSR is updated even if the write to rd reported a fault.
 *
 * Returns the fault from a failed source-register read (GSR untouched),
 * otherwise the result of write_iureg() for rd.
 */
static enum ftt_type
vis_bmask(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst,   /* FPU instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw,   /* Pointer to locals and ins. */
        kfpu_t          *fp)    /* Need to fp to access gsr reg */
{
        uint_t  src1, src2, dst;
        enum ftt_type ftt;
        uint64_t sum, addend, gsr;

        VISINFO_KSTAT(vis_bmask);
        src1 = inst.rs1;
        src2 = inst.rs2;
        dst = inst.rd;

        ftt = read_iureg(pfpsd, src1, pregs, prw, &sum);
        if (ftt == ftt_none)
                ftt = read_iureg(pfpsd, src2, pregs, prw, &addend);
        if (ftt != ftt_none)
                return (ftt);

        sum += addend;
        ftt = write_iureg(pfpsd, dst, pregs, prw, &sum);

        /* Replace GSR.mask with the low bits of the sum. */
        gsr = pfpsd->fp_current_read_gsr(fp);
        gsr = (gsr & ~(GSR_MASK_MASK)) |
            ((sum << GSR_MASK_SHIFT) & GSR_MASK_MASK);
        pfpsd->fp_current_write_gsr(gsr, fp);

        return (ftt);
}

/*
 * Simulator for fp[add|sub]* instruction.
 *
 * Performs lane-wise wrap-around addition or subtraction (rs1 op rs2)
 * on 16-bit or 32-bit lanes.  The forms without the "s" suffix operate
 * on a 64-bit extended word (four 16-bit or two 32-bit lanes); the "s"
 * forms operate on a 32-bit word (two 16-bit lanes or one 32-bit lane).
 *
 * For even opf values, odd register numbers are remapped with
 * (n & 0x1e) | 0x20 before use.
 *
 * Always returns ftt_none.  An opf that is not one of the eight
 * fpadd/fpsub opcodes leaves the registers untouched.
 */
static enum ftt_type
vis_fpaddsub(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst)   /* FPU instruction to simulate. */
{
        uint_t  src1 = inst.rs1;
        uint_t  src2 = inst.rs2;
        uint_t  dst = inst.rd;
        union {
                uint64_t        d;
                uint32_t        w[2];
                uint16_t        h[4];
        } a64, b64, r64;
        union {
                uint32_t        w;
                uint16_t        h[2];
        } a32, b32, r32;
        int subtract;
        int lane;

        if ((inst.opf & 1) == 0) {      /* double precision encoding */
                if (src1 & 1)
                        src1 = (src1 & 0x1e) | 0x20;
                if (src2 & 1)
                        src2 = (src2 & 0x1e) | 0x20;
                if (dst & 1)
                        dst = (dst & 0x1e) | 0x20;
        }

        subtract = (inst.opf == fpsub16 || inst.opf == fpsub16s ||
            inst.opf == fpsub32 || inst.opf == fpsub32s);

        switch (inst.opf) {
        case fpadd16:
        case fpsub16:
                /* Four 16-bit lanes in a 64-bit register. */
                _fp_unpack_extword(pfpsd, &a64.d, src1);
                _fp_unpack_extword(pfpsd, &b64.d, src2);
                for (lane = 0; lane < 4; lane++) {
                        if (subtract)
                                r64.h[lane] = a64.h[lane] - b64.h[lane];
                        else
                                r64.h[lane] = a64.h[lane] + b64.h[lane];
                }
                _fp_pack_extword(pfpsd, &r64.d, dst);
                break;
        case fpadd32:
        case fpsub32:
                /* Two 32-bit lanes in a 64-bit register. */
                _fp_unpack_extword(pfpsd, &a64.d, src1);
                _fp_unpack_extword(pfpsd, &b64.d, src2);
                for (lane = 0; lane < 2; lane++) {
                        if (subtract)
                                r64.w[lane] = a64.w[lane] - b64.w[lane];
                        else
                                r64.w[lane] = a64.w[lane] + b64.w[lane];
                }
                _fp_pack_extword(pfpsd, &r64.d, dst);
                break;
        case fpadd16s:
        case fpsub16s:
                /* Two 16-bit lanes in a 32-bit register. */
                _fp_unpack_word(pfpsd, &a32.w, src1);
                _fp_unpack_word(pfpsd, &b32.w, src2);
                for (lane = 0; lane < 2; lane++) {
                        if (subtract)
                                r32.h[lane] = a32.h[lane] - b32.h[lane];
                        else
                                r32.h[lane] = a32.h[lane] + b32.h[lane];
                }
                _fp_pack_word(pfpsd, &r32.w, dst);
                break;
        case fpadd32s:
        case fpsub32s:
                /* A single 32-bit lane. */
                _fp_unpack_word(pfpsd, &a32.w, src1);
                _fp_unpack_word(pfpsd, &b32.w, src2);
                r32.w = subtract ? (a32.w - b32.w) : (a32.w + b32.w);
                _fp_pack_word(pfpsd, &r32.w, dst);
                break;
        }
        return (ftt_none);
}

/*
 * Simulator for fcmp* instruction.
 *
 * Both sources are 64-bit FP registers (odd register numbers are
 * remapped with (n & 0x1e) | 0x20) and are compared lane by lane as
 * signed values: four 16-bit lanes for the *16 opcodes, two 32-bit lanes
 * for the *32 opcodes.  Each lane for which "rs1 <rel> rs2" holds sets
 * one bit in the result; array element 0 maps to the highest result bit
 * (bit 3 or bit 1) and the last element to bit 0.  The result is written
 * to the integer register rd.
 *
 * An opf that is not one of the eight fcmp opcodes writes 0 to rd.
 *
 * Returns the result of write_iureg() for rd.
 */
static enum ftt_type
vis_fcmp(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst,   /* FPU instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw)   /* Pointer to locals and ins. */
{
        enum { VFC_LE, VFC_NE, VFC_GT, VFC_EQ } rel = VFC_LE;
        uint_t  src1 = inst.rs1;
        uint_t  src2 = inst.rs2;
        uint_t  dst = inst.rd;
        union {
                uint64_t        d;
                uint32_t        w[2];
                uint16_t        h[4];
        } lhs, rhs;
        uint64_t result = 0;
        int lanes = 0;          /* 4 for 16-bit compares, 2 for 32-bit */
        int lane, a, b, hit;

        if (src1 & 1)           /* fix register encoding */
                src1 = (src1 & 0x1e) | 0x20;
        if (src2 & 1)
                src2 = (src2 & 0x1e) | 0x20;

        _fp_unpack_extword(pfpsd, &lhs.d, src1);
        _fp_unpack_extword(pfpsd, &rhs.d, src2);

        /* Pick the relation and lane count, and account for the opcode. */
        switch (inst.opf) {
        case fcmple16:
                VISINFO_KSTAT(vis_fcmple16);
                rel = VFC_LE;
                lanes = 4;
                break;
        case fcmpne16:
                VISINFO_KSTAT(vis_fcmpne16);
                rel = VFC_NE;
                lanes = 4;
                break;
        case fcmpgt16:
                VISINFO_KSTAT(vis_fcmpgt16);
                rel = VFC_GT;
                lanes = 4;
                break;
        case fcmpeq16:
                VISINFO_KSTAT(vis_fcmpeq16);
                rel = VFC_EQ;
                lanes = 4;
                break;
        case fcmple32:
                VISINFO_KSTAT(vis_fcmple32);
                rel = VFC_LE;
                lanes = 2;
                break;
        case fcmpne32:
                VISINFO_KSTAT(vis_fcmpne32);
                rel = VFC_NE;
                lanes = 2;
                break;
        case fcmpgt32:
                VISINFO_KSTAT(vis_fcmpgt32);
                rel = VFC_GT;
                lanes = 2;
                break;
        case fcmpeq32:
                VISINFO_KSTAT(vis_fcmpeq32);
                rel = VFC_EQ;
                lanes = 2;
                break;
        }

        for (lane = 0; lane < lanes; lane++) {
                /* Fetch the lane as a signed value. */
                if (lanes == 4) {
                        a = (short)lhs.h[lane];
                        b = (short)rhs.h[lane];
                } else {
                        a = (int)lhs.w[lane];
                        b = (int)rhs.w[lane];
                }

                switch (rel) {
                case VFC_LE:
                        hit = (a <= b);
                        break;
                case VFC_NE:
                        hit = (a != b);
                        break;
                case VFC_GT:
                        hit = (a > b);
                        break;
                default:
                        hit = (a == b);
                        break;
                }

                /* Element 0 owns the highest result bit. */
                if (hit)
                        result |= (uint64_t)1 << (lanes - 1 - lane);
        }

        return (write_iureg(pfpsd, dst, pregs, prw, &result));
}

/*
 * Simulator for fmul* instruction.
 */
static enum ftt_type
vis_fmul(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst)   /* FPU instruction to simulate. */
{
        uint_t  nrs1, nrs2, nrd;        /* Register number fields. */
        union {
                uint64_t        ll;
                uint32_t        i[2];
                uint16_t        s[4];
                uint8_t         c[8];
        } lrs1, lrs2, lrd;
        union {
                uint32_t        i;
                uint16_t        s[2];
                uint8_t         c[4];
        } krs1, krs2, kres;
        short s1, s2, sres;
        ushort_t us1;
        char c1;
        int i;

        nrs1 = inst.rs1;
        nrs2 = inst.rs2;
        nrd = inst.rd;
        if ((inst.opf & 1) == 0) {      /* double precision */
                if ((nrd & 1) == 1)     /* fix register encoding */
                        nrd = (nrd & 0x1e) | 0x20;
        }

        switch (inst.opf) {
        case fmul8x16:
                VISINFO_KSTAT(vis_fmul8x16);
                _fp_unpack_word(pfpsd, &krs1.i, nrs1);
                if ((nrs2 & 1) == 1)
                        nrs2 = (nrs2 & 0x1e) | 0x20;
                _fp_unpack_extword(pfpsd, &lrs2.ll, nrs2);
                for (i = 0; i <= 3; i++) {
                        us1 = (ushort_t)krs1.c[i];
                        s2 = (short)lrs2.s[i];
                        kres.i = us1 * s2;
                        sres = (short)((kres.c[1] << 8) | kres.c[2]);
                        if (kres.c[3] >= 0x80)
                                sres++;
                        lrd.s[i] = sres;
                }
                _fp_pack_extword(pfpsd, &lrd.ll, nrd);
                break;
        case fmul8x16au:
                VISINFO_KSTAT(vis_fmul8x16au);
                _fp_unpack_word(pfpsd, &krs1.i, nrs1);
                _fp_unpack_word(pfpsd, &krs2.i, nrs2);
                for (i = 0; i <= 3; i++) {
                        us1 = (ushort_t)krs1.c[i];
                        s2 = (short)krs2.s[0];
                        kres.i = us1 * s2;
                        sres = (short)((kres.c[1] << 8) | kres.c[2]);
                        if (kres.c[3] >= 0x80)
                                sres++;
                        lrd.s[i] = sres;
                }
                _fp_pack_extword(pfpsd, &lrd.ll, nrd);
                break;
        case fmul8x16al:
                VISINFO_KSTAT(vis_fmul8x16al);
                _fp_unpack_word(pfpsd, &krs1.i, nrs1);
                _fp_unpack_word(pfpsd, &krs2.i, nrs2);
                for (i = 0; i <= 3; i++) {
                        us1 = (ushort_t)krs1.c[i];
                        s2 = (short)krs2.s[1];
                        kres.i = us1 * s2;
                        sres = (short)((kres.c[1] << 8) | kres.c[2]);
                        if (kres.c[3] >= 0x80)
                                sres++;
                        lrd.s[i] = sres;
                }
                _fp_pack_extword(pfpsd, &lrd.ll, nrd);
                break;
        case fmul8sux16:
                VISINFO_KSTAT(vis_fmul8sux16);
                if ((nrs1 & 1) == 1)    /* fix register encoding */
                        nrs1 = (nrs1 & 0x1e) | 0x20;
                _fp_unpack_extword(pfpsd, &lrs1.ll, nrs1);
                if ((nrs2 & 1) == 1)
                        nrs2 = (nrs2 & 0x1e) | 0x20;
                _fp_unpack_extword(pfpsd, &lrs2.ll, nrs2);
                for (i = 0; i <= 3; i++) {
                        c1 = lrs1.c[(i*2)];
                        s1 = (short)c1;         /* keeps the sign alive */
                        s2 = (short)lrs2.s[i];
                        kres.i = s1 * s2;
                        sres = (short)((kres.c[1] << 8) | kres.c[2]);
                        if (kres.c[3] >= 0x80)
                                sres++;
                        if (sres < 0)
                                lrd.s[i] = (sres & 0xFFFF);
                        else
                                lrd.s[i] = sres;
                }
                _fp_pack_extword(pfpsd, &lrd.ll, nrd);
                break;
        case fmul8ulx16:
                VISINFO_KSTAT(vis_fmul8ulx16);
                if ((nrs1 & 1) == 1)    /* fix register encoding */
                        nrs1 = (nrs1 & 0x1e) | 0x20;
                _fp_unpack_extword(pfpsd, &lrs1.ll, nrs1);
                if ((nrs2 & 1) == 1)
                        nrs2 = (nrs2 & 0x1e) | 0x20;
                _fp_unpack_extword(pfpsd, &lrs2.ll, nrs2);
                for (i = 0; i <= 3; i++) {
                        us1 = (ushort_t)lrs1.c[(i*2)+1];
                        s2 = (short)lrs2.s[i];
                        kres.i = us1 * s2;
                        sres = (short)kres.s[0];
                        if (kres.s[1] >= 0x8000)
                                sres++;
                        lrd.s[i] = sres;
                }
                _fp_pack_extword(pfpsd, &lrd.ll, nrd);
                break;
        case fmuld8sux16:
                VISINFO_KSTAT(vis_fmuld8sux16);
                _fp_unpack_word(pfpsd, &krs1.i, nrs1);
                _fp_unpack_word(pfpsd, &krs2.i, nrs2);
                for (i = 0; i <= 1; i++) {
                        c1 = krs1.c[(i*2)];
                        s1 = (short)c1;         /* keeps the sign alive */
                        s2 = (short)krs2.s[i];
                        kres.i = s1 * s2;
                        lrd.i[i] = kres.i << 8;
                }
                _fp_pack_extword(pfpsd, &lrd.ll, nrd);
                break;
        case fmuld8ulx16:
                VISINFO_KSTAT(vis_fmuld8ulx16);
                _fp_unpack_word(pfpsd, &krs1.i, nrs1);
                _fp_unpack_word(pfpsd, &krs2.i, nrs2);
                for (i = 0; i <= 1; i++) {
                        us1 = (ushort_t)krs1.c[(i*2)+1];
                        s2 = (short)krs2.s[i];
                        lrd.i[i] = us1 * s2;
                }
                _fp_pack_extword(pfpsd, &lrd.ll, nrd);
                break;
        }
        return (ftt_none);
}

/*
 * Simulator for the pixel formatting instructions: fpack16, fpack32,
 * fpackfix, fexpand and fpmerge.
 *
 * The three pack instructions read the scale factor from the GSR (obtained
 * through pfpsd->fp_current_read_gsr), shift each source element left by
 * it, drop the low-order bits and clamp the result to the destination
 * element's range.  fexpand and fpmerge do not consult the GSR.  An opf
 * value matching none of the cases changes nothing.  Always returns
 * ftt_none.
 */
static enum ftt_type
vis_fpixel(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst,   /* FPU instruction to simulate. */
        kfpu_t          *fp)    /* Handed to the GSR read callback. */
{
        uint_t  src1 = inst.rs1;        /* Register number fields. */
        uint_t  src2 = inst.rs2;
        uint_t  dst = inst.rd;
        int     lane, scale, fixed;
        union {
                uint64_t        ll;
                uint32_t        i[2];
                uint16_t        s[4];
                uint8_t         c[8];
        } dbl1, dbl2, dblout;
        union {
                uint32_t        i;
                uint16_t        s[2];
                uint8_t         c[4];
        } sgl1, sgl2, sglout;
        uint64_t gsr;
        int64_t wide;
        short   half;
        uchar_t pix;

        /*
         * fpack16 and fpackfix store a single word; every other
         * instruction here stores an extended word, so rd needs the
         * double-register encoding fix.
         */
        if ((inst.opf != fpack16) && (inst.opf != fpackfix) &&
            ((dst & 1) != 0))
                dst = (dst & 0x1e) | 0x20;

        switch (inst.opf) {
        case fpack16:
                /* Four signed 16-bit elements -> four bytes in 0..255. */
                VISINFO_KSTAT(vis_fpack16);
                if ((src2 & 1) != 0)    /* fix register encoding */
                        src2 = (src2 & 0x1e) | 0x20;
                _fp_unpack_extword(pfpsd, &dbl2.ll, src2);
                gsr = pfpsd->fp_current_read_gsr(fp);
                /* fpack16 ignores GSR.scale msb */
                scale = (int)(GSR_SCALE(gsr) & 0xf);
                for (lane = 0; lane < 4; lane++) {
                        half = (short)dbl2.s[lane];     /* keep the sign */
                        fixed = ((int)half << scale) >> 7;
                        pix = (fixed < 0) ? 0 :
                            ((fixed > 255) ? 255 : (uchar_t)fixed);
                        sglout.c[lane] = pix;
                }
                _fp_pack_word(pfpsd, &sglout.i, dst);
                break;
        case fpack32:
                /*
                 * rs1 shifted left by 8 supplies the result; bytes 3 and
                 * 7 are then replaced by the clamped, scaled 32-bit
                 * elements of rs2.
                 */
                VISINFO_KSTAT(vis_fpack32);
                if ((src1 & 1) != 0)    /* fix register encoding */
                        src1 = (src1 & 0x1e) | 0x20;
                _fp_unpack_extword(pfpsd, &dbl1.ll, src1);
                if ((src2 & 1) != 0)
                        src2 = (src2 & 0x1e) | 0x20;
                _fp_unpack_extword(pfpsd, &dbl2.ll, src2);

                gsr = pfpsd->fp_current_read_gsr(fp);
                scale = (int)GSR_SCALE(gsr);
                dblout.ll = dbl1.ll << 8;
                for (lane = 0; lane < 2; lane++) {
                        fixed = (int)dbl2.i[lane];      /* keep the sign */
                        wide = ((int64_t)fixed << scale) >> 23;
                        pix = (wide < 0) ? 0 :
                            ((wide > 255) ? 255 : (uchar_t)wide);
                        dblout.c[(lane * 4) + 3] = pix;
                }
                _fp_pack_extword(pfpsd, &dblout.ll, dst);
                break;
        case fpackfix:
                /* Two signed 32-bit elements -> two clamped 16-bit ones. */
                VISINFO_KSTAT(vis_fpackfix);
                if ((src2 & 1) != 0)    /* fix register encoding */
                        src2 = (src2 & 0x1e) | 0x20;
                _fp_unpack_extword(pfpsd, &dbl2.ll, src2);

                gsr = pfpsd->fp_current_read_gsr(fp);
                scale = (int)GSR_SCALE(gsr);
                for (lane = 0; lane < 2; lane++) {
                        fixed = (int)dbl2.i[lane];      /* keep the sign */
                        wide = ((int64_t)fixed << scale) >> 16;
                        if (wide > 32767)
                                half = 32767;
                        else if (wide < -32768)
                                half = -32768;
                        else
                                half = (short)wide;
                        sglout.s[lane] = half;
                }
                _fp_pack_word(pfpsd, &sglout.i, dst);
                break;
        case fexpand:
                /* Each byte of rs2 becomes a 16-bit element, shifted by 4. */
                VISINFO_KSTAT(vis_fexpand);
                _fp_unpack_word(pfpsd, &sgl2.i, src2);
                for (lane = 0; lane < 4; lane++)
                        dblout.s[lane] = (ushort_t)(sgl2.c[lane] << 4);
                _fp_pack_extword(pfpsd, &dblout.ll, dst);
                break;
        case fpmerge:
                /* Interleave the bytes of rs1 and rs2, rs1 first. */
                VISINFO_KSTAT(vis_fpmerge);
                _fp_unpack_word(pfpsd, &sgl1.i, src1);
                _fp_unpack_word(pfpsd, &sgl2.i, src2);
                for (lane = 0; lane < 4; lane++) {
                        dblout.c[lane * 2] = sgl1.c[lane];
                        dblout.c[(lane * 2) + 1] = sgl2.c[lane];
                }
                _fp_pack_extword(pfpsd, &dblout.ll, dst);
                break;
        default:
                break;
        }
        return (ftt_none);
}

/*
 * Simulator for the pdist and pdistn instructions.
 *
 * Both forms sum the absolute differences of the eight corresponding bytes
 * of floating-point registers rs1 and rs2.
 *
 *   pdist   adds that sum to the current contents of floating-point
 *           register rd and stores the total back into it.
 *   pdistn  starts from zero and stores the sum in integer register rd
 *           through write_iureg().
 *
 * Returns ftt_none on success.  For pdistn, a failure reported by
 * write_iureg() is returned to the caller instead of being dropped, which
 * matches how the compare simulator in this file treats its integer
 * destination.
 */
enum ftt_type
vis_pdist(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        fp_inst_type    pinst,  /* FPU instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw,   /* Pointer to locals and ins. */
        uint_t          op)     /* Opcode pdist or pdistn */
{
        uint_t  nrs1, nrs2, nrd;        /* Register number fields. */
        int     i;
        short   s;
        union {
                uint64_t        ll;
                uint8_t         c[8];
        } lrs1, lrs2, lrd;
        enum ftt_type   ftt = ftt_none;

        nrs1 = pinst.rs1;
        nrs2 = pinst.rs2;
        nrd = pinst.rd;

        if ((nrs1 & 1) == 1)            /* fix register encoding */
                nrs1 = (nrs1 & 0x1e) | 0x20;
        if ((nrs2 & 1) == 1)
                nrs2 = (nrs2 & 0x1e) | 0x20;

        _fp_unpack_extword(pfpsd, &lrs1.ll, nrs1);
        _fp_unpack_extword(pfpsd, &lrs2.ll, nrs2);

        if (op == pdist) {
                VISINFO_KSTAT(vis_pdist);
                /*
                 * Only pdist has a floating-point destination.  The
                 * double-register encoding fix must not be applied to
                 * pdistn: its rd is an integer register number, and
                 * remapping an odd value would turn it into 32 or more.
                 */
                if ((nrd & 1) == 1)
                        nrd = (nrd & 0x1e) | 0x20;
                _fp_unpack_extword(pfpsd, &lrd.ll, nrd);
        } else {
                /* pdistn */
                VISINFO_KSTAT(vis_pdistn);
                lrd.ll = 0;
        }

        for (i = 0; i <= 7; i++) {
                s = (short)(lrs1.c[i] - lrs2.c[i]);
                if (s < 0)
                        s = ~s + 1;     /* absolute value */
                lrd.ll += s;
        }

        if (op == pdist)
                _fp_pack_extword(pfpsd, &lrd.ll, nrd);
        else
                /* pdistn */
                ftt = write_iureg(pfpsd, nrd, pregs, prw, &lrd.ll);
        return (ftt);
}

/*
 * Simulator for the faligndata instruction.
 *
 * Builds the eight-byte result from rs1 and rs2: bytes of rs1 are copied
 * starting at the index given by GSR_ALIGN() of the current GSR and
 * running through index 7; whatever room is left in the result is then
 * filled from rs2 starting at its index 0.  The result is stored in rd.
 * Always returns ftt_none.
 */
static enum ftt_type
vis_faligndata(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        fp_inst_type    pinst,  /* FPU instruction to simulate. */
        kfpu_t          *fp)    /* Handed to the GSR read callback. */
{
        uint_t  src1 = pinst.rs1;       /* Register number fields. */
        uint_t  src2 = pinst.rs2;
        uint_t  dst = pinst.rd;
        int     out, from1, from2, align;
        union {
                uint64_t        ll;
                uint8_t         c[8];
        } hi, lo, res;
        uint64_t gsr;

        /* fix register encoding */
        if ((src1 & 1) != 0)
                src1 = (src1 & 0x1e) | 0x20;
        if ((src2 & 1) != 0)
                src2 = (src2 & 0x1e) | 0x20;
        if ((dst & 1) != 0)
                dst = (dst & 0x1e) | 0x20;

        _fp_unpack_extword(pfpsd, &hi.ll, src1);
        _fp_unpack_extword(pfpsd, &lo.ll, src2);

        gsr = pfpsd->fp_current_read_gsr(fp);
        align = (int)GSR_ALIGN(gsr);

        /* First the tail of rs1 ... */
        out = 0;
        for (from1 = align; (from1 <= 7) && (out <= 7); from1++)
                res.c[out++] = hi.c[from1];
        /* ... then the head of rs2 for whatever is still missing. */
        for (from2 = 0; out <= 7; from2++)
                res.c[out++] = lo.c[from2];

        _fp_pack_extword(pfpsd, &res.ll, dst);

        return (ftt_none);
}

/*
 * Simulator for the bshuffle instruction.
 *
 * Each byte of rd is chosen from the 16 bytes of rs1 and rs2 by one 4-bit
 * selector taken from GSR_MASK() of the current GSR.  Always returns
 * ftt_none.
 */
static enum ftt_type
vis_bshuffle(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        fp_inst_type    pinst,  /* FPU instruction to simulate. */
        kfpu_t          *fp)    /* Handed to the GSR read callback. */
{
        uint_t  src1 = pinst.rs1;       /* Register number fields. */
        uint_t  src2 = pinst.rs2;
        uint_t  dst = pinst.rd;
        int     pos, sel, mask;
        union {
                uint64_t        ll;
                uint8_t         c[8];
        } upper, lower, res;
        uint64_t gsr;

        VISINFO_KSTAT(vis_bshuffle);

        /* fix register encoding */
        if ((src1 & 1) != 0)
                src1 = (src1 & 0x1e) | 0x20;
        if ((src2 & 1) != 0)
                src2 = (src2 & 0x1e) | 0x20;
        if ((dst & 1) != 0)
                dst = (dst & 0x1e) | 0x20;

        _fp_unpack_extword(pfpsd, &upper.ll, src1);
        _fp_unpack_extword(pfpsd, &lower.ll, src2);

        gsr = pfpsd->fp_current_read_gsr(fp);
        mask = (int)GSR_MASK(gsr);

        /*
         * Destination byte selection:
         *
         *      rd byte         selector
         *      0               GSR.mask<31..28>
         *      1               GSR.mask<27..24>
         *      2               GSR.mask<23..20>
         *      3               GSR.mask<19..16>
         *      4               GSR.mask<15..12>
         *      5               GSR.mask<11..8>
         *      6               GSR.mask<7..4>
         *      7               GSR.mask<3..0>
         *
         * Selectors 0-7 pick a byte of rs1 (the upper half) and
         * selectors 8-15 pick a byte of rs2 (the lower half).  Source
         * bytes are numbered from most to least significant.  The loop
         * walks from rd byte 7 upward, consuming the mask four bits at a
         * time from its low end.
         */
        for (pos = 7; pos >= 0; pos--) {
                sel = mask & 0xf;
                res.c[pos] = ((sel & 8) != 0) ?
                    lower.c[sel & 7] : upper.c[sel];
                mask >>= 4;
        }
        _fp_pack_extword(pfpsd, &res.ll, dst);

        return (ftt_none);
}

/*
 * Simulator for the siam instruction.
 *
 * Reads the current GSR, clears the bits covered by GSR_IM_IRND_MASK,
 * inserts the low three bits of the instruction's rs2 field at
 * GSR_IRND_SHIFT and writes the GSR back.  Always returns ftt_none.
 */
static enum ftt_type
vis_siam(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst,   /* FPU instruction to simulate. */
        kfpu_t          *fp)    /* Handed to the GSR read/write callbacks. */
{
        uint_t          field = inst.rs2;
        uint64_t        mode = field & 0x7;     /* 3-bit mode value */
        uint64_t        gsr;

        gsr = pfpsd->fp_current_read_gsr(fp);
        /* Drop the old IM and IRND fields, then install the new mode. */
        gsr = (gsr & ~(GSR_IM_IRND_MASK)) | (mode << GSR_IRND_SHIFT);
        pfpsd->fp_current_write_gsr(gsr, fp);
        return (ftt_none);
}

/*
 * Simulator for VIS loads and stores between floating-point unit and memory.
 *
 * Pure dispatcher: the asi selects the partial-store, short load/store or
 * block load/store simulator, and that simulator's result is returned.
 * Any asi not listed here yields ftt_unimplemented.  The trap is asserted
 * to have come from user mode.
 */
enum ftt_type
vis_fldst(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        fp_inst_type    pinst,  /* FPU instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw,   /* Pointer to locals and ins. */
        uint_t          asi)    /* asi to emulate! */
{
        /* Reinterpret the generic instruction image as a VIS instruction. */
        union {
                vis_inst_type   inst;
                fp_inst_type    pinst;
        } view;
        enum ftt_type   ftt;

        ASSERT(USERMODE(pregs->r_tstate));
        view.pinst = pinst;

        switch (asi) {
        /* Partial stores. */
        case ASI_PST8_P:
        case ASI_PST8_S:
        case ASI_PST8_PL:
        case ASI_PST8_SL:
        case ASI_PST16_P:
        case ASI_PST16_S:
        case ASI_PST16_PL:
        case ASI_PST16_SL:
        case ASI_PST32_P:
        case ASI_PST32_S:
        case ASI_PST32_PL:
        case ASI_PST32_SL:
                ftt = vis_prtl_fst(pfpsd, view.inst, pregs, prw, asi);
                break;
        /* Short (8- and 16-bit) loads and stores. */
        case ASI_FL8_P:
        case ASI_FL8_S:
        case ASI_FL8_PL:
        case ASI_FL8_SL:
        case ASI_FL16_P:
        case ASI_FL16_S:
        case ASI_FL16_PL:
        case ASI_FL16_SL:
                ftt = vis_short_fls(pfpsd, view.inst, pregs, prw, asi);
                break;
        /* Block loads and stores. */
        case ASI_BLK_AIUP:
        case ASI_BLK_AIUS:
        case ASI_BLK_AIUPL:
        case ASI_BLK_AIUSL:
        case ASI_BLK_P:
        case ASI_BLK_S:
        case ASI_BLK_PL:
        case ASI_BLK_SL:
        case ASI_BLK_COMMIT_P:
        case ASI_BLK_COMMIT_S:
                ftt = vis_blk_fldst(pfpsd, view.inst, pregs, prw, asi);
                break;
        default:
                ftt = ftt_unimplemented;
                break;
        }
        return (ftt);
}

/*
 * Simulator for partial stores between floating-point unit and memory.
 *
 * The effective address comes from integer register rs1 and the element
 * mask from integer register rs2; an instruction with opf bit 0x100 set is
 * rejected with ftt_unimplemented.  The address must be a multiple of 4,
 * otherwise ftt_alignment is returned.
 *
 * For a recognized asi the eight bytes at the address are read, each
 * element whose mask bit is set is replaced by data from floating-point
 * register rd, and the merged eight bytes are written back.  Elements are
 * 1, 2 or 4 bytes wide depending on the asi:
 *
 *   - non-"L" asis: element n is controlled by mask bit
 *     (number of elements - 1 - n) and register bytes are copied in place;
 *   - "L" (little-endian) asis: element n is controlled by mask bit n and
 *     memory byte b receives register byte (7 - b).
 *
 * An asi that is not a partial-store asi causes no memory access.  On
 * success pc/npc are stepped past the instruction and ftt_none is
 * returned; failures from the register or memory helpers are returned
 * as they are.
 */
static enum ftt_type
vis_prtl_fst(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst,   /* ISE instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw,   /* Pointer to locals and ins. */
        uint_t          asi)    /* asi to emulate! */
{
        uint_t  addr_reg = inst.rs1;    /* Register number fields. */
        uint_t  mask_reg = inst.rs2;
        uint_t  data_reg = inst.rd;
        uint_t  mask, bit;
        int     width, count, elem, off, idx;
        boolean_t reversed = B_FALSE;
        uint64_t ea, rawmask;
        union {
                freg_type       f;
                uint64_t        ll;
                uint8_t         c[8];
        } reg, mem, res;
        enum ftt_type   ftt;

        if ((data_reg & 1) != 0)        /* fix register encoding */
                data_reg = (data_reg & 0x1e) | 0x20;
        res.ll = 0;

        if ((inst.opf & 0x100) != 0) {
                pfpsd->fp_trapaddr = (caddr_t)pregs->r_pc;
                return (ftt_unimplemented);
        }

        /* effective address = rs1, mask = rs2 */
        ftt = read_iureg(pfpsd, addr_reg, pregs, prw, &ea);
        if (ftt != ftt_none)
                return (ftt);
        ftt = read_iureg(pfpsd, mask_reg, pregs, prw, &rawmask);
        if (ftt != ftt_none)
                return (ftt);
        mask = (uint_t)rawmask;

        pfpsd->fp_trapaddr = (caddr_t)ea; /* setup bad addr in case we trap */
        if ((ea & 0x3) != 0)
                return (ftt_alignment); /* Require 32 bit-alignment. */

        /* Work out the element width and byte order from the asi. */
        switch (asi) {
        case ASI_PST8_PL:               /* little-endian */
        case ASI_PST8_SL:
                reversed = B_TRUE;
                /* FALLTHROUGH */
        case ASI_PST8_P:
        case ASI_PST8_S:
                width = 1;
                break;
        case ASI_PST16_PL:
        case ASI_PST16_SL:
                reversed = B_TRUE;
                /* FALLTHROUGH */
        case ASI_PST16_P:
        case ASI_PST16_S:
                width = 2;
                break;
        case ASI_PST32_PL:
        case ASI_PST32_SL:
                reversed = B_TRUE;
                /* FALLTHROUGH */
        case ASI_PST32_P:
        case ASI_PST32_S:
                width = 4;
                break;
        default:
                width = 0;              /* nothing to store */
                break;
        }

        if (width != 0) {
                ftt = _fp_read_extword((uint64_t *)ea, &mem.ll, pfpsd);
                if (ftt != ftt_none)
                        return (ftt);
                _fp_unpack_extword(pfpsd, &reg.f.FPU_DREG_FIELD, data_reg);

                count = 8 / width;
                for (elem = 0; elem < count; elem++) {
                        bit = 1U << (reversed ? elem : (count - 1 - elem));
                        for (off = 0; off < width; off++) {
                                idx = (elem * width) + off;
                                if ((mask & bit) == 0)
                                        res.c[idx] = mem.c[idx];
                                else if (reversed)
                                        res.c[idx] = reg.c[7 - idx];
                                else
                                        res.c[idx] = reg.c[idx];
                        }
                }

                ftt = _fp_write_extword((uint64_t *)ea, res.ll, pfpsd);
                if (ftt != ftt_none)
                        return (ftt);
        }

        pregs->r_pc = pregs->r_npc;     /* Do not retry emulated instruction. */
        pregs->r_npc += 4;
        return (ftt_none);
}

/*
 * Simulator for short load/stores between floating-point unit and memory.
 *
 * The effective address is rs1 + rs2 when opf bit 0x100 is clear and
 * rs1 + simm13 otherwise; for an ILP32 process it is truncated to 32 bits.
 * (inst.op3 & 7) == 3 selects a load, anything else a store.
 *
 *   - 8-bit asis: a load zeroes the register image and places the byte in
 *     byte 7; a store writes byte 7 to memory.
 *   - 16-bit asis: the address must be even (ftt_alignment otherwise).
 *     A load zeroes the register image and places the half-word in
 *     half-word 3; a store writes half-word 3.  The "L" asis exchange the
 *     two bytes of the half-word on the way.
 *
 * Failed user memory accesses return ftt_fault.  On success pc/npc are
 * stepped past the instruction and ftt_none is returned.  An asi that
 * matches no case performs no transfer but still steps pc/npc.
 */
static enum ftt_type
vis_short_fls(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst,   /* ISE instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw,   /* Pointer to locals and ins. */
        uint_t          asi)    /* asi to emulate! */
{
        uint_t  base_reg = inst.rs1;    /* Register number fields. */
        uint_t  index_reg = inst.rs2;
        uint_t  data_reg = inst.rd;
        uint64_t ea, offset;
        union {
                freg_type       f;
                uint64_t        ll;
                uint16_t        s[4];
                uint8_t         c[8];
        } val;
        union {
                vis_inst_type   inst;
                int             i;
        } raw;
        enum ftt_type   ftt;
        boolean_t is_load, swapped;
        ushort_t half;
        uchar_t byte;

        if ((data_reg & 1) != 0)        /* fix register encoding */
                data_reg = (data_reg & 0x1e) | 0x20;

        /* Both addressing forms start from rs1. */
        ftt = read_iureg(pfpsd, base_reg, pregs, prw, &ea);
        if (ftt != ftt_none)
                return (ftt);
        if ((inst.opf & 0x100) == 0) {  /* effective address = rs1 + rs2 */
                ftt = read_iureg(pfpsd, index_reg, pregs, prw, &offset);
                if (ftt != ftt_none)
                        return (ftt);
        } else {                        /* effective address = rs1 + imm13 */
                raw.inst = inst;
                offset = (raw.i << 19) >> 19;   /* Extract simm13 field. */
        }
        ea += offset;
        if (get_udatamodel() == DATAMODEL_ILP32)
                ea = (uint64_t)(caddr32_t)ea;

        pfpsd->fp_trapaddr = (caddr_t)ea; /* setup bad addr in case we trap */
        is_load = ((inst.op3 & 7) == 3) ? B_TRUE : B_FALSE;

        switch (asi) {
        case ASI_FL8_P:
        case ASI_FL8_S:
        case ASI_FL8_PL:                /* little-endian */
        case ASI_FL8_SL:
                if (is_load) {                  /* load byte */
                        if (fuword8((void *)ea, &byte) == -1)
                                return (ftt_fault);
                        val.ll = 0;
                        val.c[7] = byte;
                        _fp_pack_extword(pfpsd, &val.f.FPU_DREG_FIELD,
                            data_reg);
                } else {                        /* store byte */
                        _fp_unpack_extword(pfpsd, &val.f.FPU_DREG_FIELD,
                            data_reg);
                        byte = val.c[7];
                        if (subyte((caddr_t)ea, byte) == -1)
                                return (ftt_fault);
                }
                break;
        case ASI_FL16_P:
        case ASI_FL16_S:
        case ASI_FL16_PL:               /* little-endian */
        case ASI_FL16_SL:
                swapped = ((asi == ASI_FL16_PL) || (asi == ASI_FL16_SL)) ?
                    B_TRUE : B_FALSE;
                if ((ea & 1) != 0)
                        return (ftt_alignment);
                if (is_load) {                  /* load short */
                        if (fuword16((void *)ea, &half) == -1)
                                return (ftt_fault);
                        val.ll = 0;
                        if (swapped) {
                                val.c[6] = (uchar_t)half;
                                val.c[7] = (uchar_t)((half & 0xff00) >> 8);
                        } else {
                                val.s[3] = half;
                        }
                        _fp_pack_extword(pfpsd, &val.f.FPU_DREG_FIELD,
                            data_reg);
                } else {                        /* store short */
                        _fp_unpack_extword(pfpsd, &val.f.FPU_DREG_FIELD,
                            data_reg);
                        if (swapped)
                                half = (ushort_t)((val.c[7] << 8) | val.c[6]);
                        else
                                half = val.s[3];
                        if (suword16((void *)ea, half) == -1)
                                return (ftt_fault);
                }
                break;
        default:
                break;
        }

        pregs->r_pc = pregs->r_npc;     /* Do not retry emulated instruction. */
        pregs->r_npc += 4;
        return (ftt_none);
}

/*
 * Simulator for block loads and stores between floating-point unit and memory.
 * We pass the address of ea to sync_data_memory() to flush the Ecache.
 * Sync_data_memory() calls platform dependent code to flush the Ecache.
 */
static enum ftt_type
vis_blk_fldst(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        vis_inst_type   inst,   /* ISE instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw,   /* Pointer to locals and ins. */
        uint_t          asi)    /* asi to emulate! */
{
        uint_t  nrs1, nrs2, nrd;        /* Register number fields. */
        uint_t  opf, h, i, j;
        uint64_t ea, tea;
        union {
                freg_type       f;
                uint64_t        ll;
                uint8_t         c[8];
        } k, l;
        union {
                vis_inst_type   inst;
                int32_t         i;
        } fp;
        enum ftt_type   ftt;
        boolean_t little_endian = B_FALSE;

        nrs1 = inst.rs1;
        nrs2 = inst.rs2;
        nrd = inst.rd;
        if ((nrd & 1) == 1)             /* fix register encoding */
                nrd = (nrd & 0x1e) | 0x20;

        /* ensure register is 8-double precision aligned */
        if ((nrd & 0xf) != 0)
                return (ftt_unimplemented);

        opf = inst.opf;
        if ((opf & 0x100) == 0) {       /* effective address = rs1 + rs2 */
                ftt = read_iureg(pfpsd, nrs1, pregs, prw, &ea);
                if (ftt != ftt_none)
                        return (ftt);
                ftt = read_iureg(pfpsd, nrs2, pregs, prw, &tea);
                if (ftt != ftt_none)
                        return (ftt);
                ea += tea;
        } else {                        /* effective address = rs1 + imm13 */
                fp.inst = inst;
                ea = (fp.i << 19) >> 19;        /* Extract simm13 field. */
                ftt = read_iureg(pfpsd, nrs1, pregs, prw, &tea);
                if (ftt != ftt_none)
                        return (ftt);
                ea += tea;
        }
        if ((ea & 0x3F) != 0)           /* Require 64 byte-alignment. */
                return (ftt_alignment);

        pfpsd->fp_trapaddr = (caddr_t)ea; /* setup bad addr in case we trap */
        switch (asi) {
        case ASI_BLK_AIUPL:
        case ASI_BLK_AIUSL:
        case ASI_BLK_PL:
        case ASI_BLK_SL:
                little_endian = B_TRUE;
                /* FALLTHROUGH */
        case ASI_BLK_AIUP:
        case ASI_BLK_AIUS:
        case ASI_BLK_P:
        case ASI_BLK_S:
        case ASI_BLK_COMMIT_P:
        case ASI_BLK_COMMIT_S:
                if ((inst.op3 & 7) == 3) {      /* lddf */
                        for (i = 0; i < 8; i++, nrd += 2) {
                                ftt = _fp_read_extword((uint64_t *)ea, &k.ll,
                                    pfpsd);
                                if (ftt != ftt_none)
                                        return (ftt);
                                if (little_endian) {
                                        for (j = 0, h = 7; j < 8; j++, h--)
                                                l.c[h] = k.c[j];
                                        k.ll = l.ll;
                                }
                                _fp_pack_extword(pfpsd, &k.f.FPU_DREG_FIELD,
                                    nrd);
                                ea += 8;
                        }
                } else {                        /* stdf */
                        for (i = 0; i < 8; i++, nrd += 2) {
                                _fp_unpack_extword(pfpsd, &k.f.FPU_DREG_FIELD,
                                    nrd);
                                if (little_endian) {
                                        for (j = 0, h = 7; j < 8; j++, h--)
                                                l.c[h] = k.c[j];
                                        k.ll = l.ll;
                                }
                                ftt = _fp_write_extword((uint64_t *)ea, k.ll,
                                    pfpsd);
                                if (ftt != ftt_none)
                                        return (ftt);
                                ea += 8;
                        }
                }
                if ((asi == ASI_BLK_COMMIT_P) || (asi == ASI_BLK_COMMIT_S))
                        sync_data_memory((caddr_t)(ea - 64), 64);
                break;
        default:
                /* addr of unimp inst */
                pfpsd->fp_trapaddr = (caddr_t)pregs->r_pc;
                return (ftt_unimplemented);
        }

        pregs->r_pc = pregs->r_npc;     /* Do not retry emulated instruction. */
        pregs->r_npc += 4;
        return (ftt_none);
}

/*
 * Simulator for rd %gsr instruction: fetch the current gsr value through
 * the simulator's read hook and store it into integer register rd.
 * The pc is stepped past the instruction whether or not the register
 * write succeeds; the status of that write is what gets returned.
 */
enum ftt_type
vis_rdgsr(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        fp_inst_type    pinst,  /* FPU instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw,   /* Pointer to locals and ins. */
        kfpu_t          *fp)    /* Needed to access the gsr reg. */
{
        uint_t dest = pinst.rd;
        uint64_t gsr_val;
        enum ftt_type status;

        gsr_val = pfpsd->fp_current_read_gsr(fp);
        status = write_iureg(pfpsd, dest, pregs, prw, &gsr_val);

        /* Do not retry emulated instruction. */
        pregs->r_pc = pregs->r_npc;
        pregs->r_npc += 4;
        return (status);
}

/*
 * Simulator for wr %gsr instruction.
 *
 * The value written to the gsr is r[rs1] xor the second operand, where the
 * second operand is r[rs2] when the instruction's i bit is clear and the
 * sign-extended 13-bit immediate when it is set.  Returns the ftt status of
 * a failed integer register read without touching the pc; otherwise writes
 * the gsr through the simulator's write hook, steps the pc past the
 * instruction and returns ftt_none.
 */
enum ftt_type
vis_wrgsr(
        fp_simd_type    *pfpsd, /* FPU simulator data. */
        fp_inst_type    pinst,  /* FPU instruction to simulate. */
        struct regs     *pregs, /* Pointer to PCB image of registers. */
        void            *prw,   /* Pointer to locals and ins. */
        kfpu_t          *fp)    /* Needed to access the gsr reg. */
{
        uint_t nrs1;
        uint64_t r, r1, r2;
        enum ftt_type ftt = ftt_none;

        nrs1 = pinst.rs1;
        ftt = read_iureg(pfpsd, nrs1, pregs, prw, &r1);
        if (ftt != ftt_none)
                return (ftt);
        if (pinst.ibit == 0) {  /* copy the value in r[rs2] */
                uint_t nrs2;

                nrs2 = pinst.rs2;
                ftt = read_iureg(pfpsd, nrs2, pregs, prw, &r2);
                if (ftt != ftt_none)
                        return (ftt);
        } else {        /* use sign_ext(simm13) */
                /*
                 * Overlay the instruction with a raw word to reach the
                 * immediate.  This union is deliberately not called "fp"
                 * so it cannot be confused with the kfpu_t parameter.
                 */
                union {
                        fp_inst_type    inst;
                        uint32_t        i;
                } word;
                uint32_t simm13;

                word.inst = pinst;
                simm13 = word.i & 0x1fff;       /* Extract simm13 field */

                /*
                 * Sign-extend explicitly to 64 bits.  Shifting the
                 * unsigned word left and right again would only
                 * zero-extend, so a negative immediate would be xor'ed
                 * in as a positive 13-bit value.
                 */
                r2 = simm13;
                if ((simm13 & 0x1000) != 0)
                        r2 |= ~(uint64_t)0x1fff;
        }
        r = r1 ^ r2;
        pfpsd->fp_current_write_gsr(r, fp);
        pregs->r_pc = pregs->r_npc;     /* Do not retry emulated instruction. */
        pregs->r_npc += 4;
        return (ftt);
}

/*
 * This is the loadable module wrapper.
 */
#include <sys/errno.h>
#include <sys/modctl.h>

/*
 * Module linkage information for the kernel.
 */
extern struct mod_ops mod_miscops;

/*
 * Linkage record for this module: it points at the mod_miscops operations
 * vector declared extern above and carries the module's description string.
 * It is referenced from modlinkage.
 */
static struct modlmisc modlmisc = {
        &mod_miscops,
        "vis fp simulation",
};

/*
 * Top-level linkage structure, passed to mod_install() and mod_info() by
 * the entry points in this file.  It lists the single modlmisc record,
 * followed by a NULL entry.
 */
static struct modlinkage modlinkage = {
        MODREV_1, (void *)&modlmisc, NULL
};

int
_init(void)
{
        int status;

        /* Hand this module's linkage structure to mod_install(). */
        status = mod_install(&modlinkage);
        return (status);
}

int
_info(struct modinfo *modinfop)
{
        int status;

        /* Let mod_info() fill in *modinfop from this module's linkage. */
        status = mod_info(&modlinkage, modinfop);
        return (status);
}