root/src/add-ons/accelerants/matrox/engine/mga_acc.c
/* MGA Acceleration functions */
/* Authors:
   Mark Watson 2/2000,
   Rudolf Cornelissen 10/2002-1/2006.
*/

#define MODULE_BIT 0x00080000

#include "mga_std.h"

/*acceleration notes*/

/*functions Be's app_server uses:
fill span (horizontal only)
fill rectangle (these 2 are very similar)
invert rectangle 
blit
*/

/* Program the destination Y start and the number of lines of the next
 * drawing command.
 * Needed by MIL 1/2 because of address linearisation constraints:
 * - if si->engine.y_lin is set (software address linearisation) the Y start is
 *   written to YDST as (dst * pixels per row) >> 5 and the line count is
 *   written to LEN separately;
 * - otherwise both are packed into YDSTLEN, 'dst' in the upper 16 bits and
 *   'len' in the lower 16 bits.
 * Only one of the two branches runs, so 'dst' and 'len' are each evaluated
 * once. Both arguments are fully parenthesised so any expression may be
 * passed. */
#define ACCW_YDSTLEN(dst, len) do { \
        if (si->engine.y_lin) { \
                ACCW(YDST, (((dst) * (si->fbc.bytes_per_row / (si->engine.depth >> 3))) >> 5)); \
                ACCW(LEN, (len)); \
        } else { \
                ACCW(YDSTLEN, (((dst) << 16) | (len))); \
        } \
} while (0)

/* Block the caller until the acceleration engine reports idle.
 * Polls the STATUS register and sleeps between polls for as long as bit 16
 * is set. Always returns B_OK.
 * NOTE: there is no timeout, so this never returns if that bit never clears. */
status_t gx00_acc_wait_idle()
{
        for (;;)
        {
                /* engine completely idle? then we are done */
                if ((ACCR(STATUS) & 0x00010000) == 0)
                        break;

                /* back off a bit between polls so the bus is not hammered */
                snooze(100);
        }

        return B_OK;
}

/* Engine required init: programs the drawing engine for the current display
 * mode. AFAIK this must be done for every new screenmode.
 *
 * Reads the mode from si->dm and si->fbc and (re)sets si->engine.y_lin,
 * si->engine.depth and si->engine.src_dst as a side effect.
 *
 * Returns B_OK on success. Returns B_ERROR if the colorspace of the mode is
 * not one of the handled ones; the registers written up to that point keep
 * their new values in that case. */
status_t gx00_acc_init()
{
        /* used for convenience: MACCESS is a write only register! */
        uint32 maccess = 0x00000000;
        /* framebuffer pitch in pixels: only valid once si->engine.depth is set */
        uint32 pixels_per_row;

        /* if we were unable to read PINS, we have to assume something (keeping bit6 zero) */
        if ((si->ps.card_type >= G450) && (si->ps.pins_status == B_OK))
        {
                /* b7 v5_mem_type = done by Mark Watson. fixme: still confirm! (unknown bits) */
                maccess |= ((((uint32)si->ps.v5_mem_type) & 0x80) >> 1);
        }

        /* preset using hardware address linearisation */
        si->engine.y_lin = 0x00;
        /* reset depth */
        si->engine.depth = 0;

        /* cleanup bitblt */
        ACCW(OPMODE,0);

        /* Set the Z origin to the start of FB (otherwise lockup on blits) */
        ACCW(ZORG,0);

        /* Set pixel width: the lower two bits of MACCESS select it */
        switch(si->dm.space)
        {
        case B_CMAP8:
                ACCW(MACCESS, ((maccess & 0xfffffffc) | 0x00));
                si->engine.depth = 8;
                break;
        case B_RGB15_LITTLE:case B_RGB16_LITTLE:
                ACCW(MACCESS, ((maccess & 0xfffffffc) | 0x01));
                si->engine.depth = 16;
                break;
        case B_RGB32_LITTLE:case B_RGBA32_LITTLE:
                ACCW(MACCESS, ((maccess & 0xfffffffc) | 0x02));
                si->engine.depth = 32;
                break;
        default:
                LOG(8,("ACC: init, invalid bit depth\n"));
                return B_ERROR;
        }

        /* the depth is known (and non-zero) from here on, so the pitch in pixels
         * can be calculated once and reused for everything below */
        pixels_per_row = si->fbc.bytes_per_row / (si->engine.depth >> 3);

        /* setup PITCH: very cardtype specific! */
        switch (si->ps.card_type)
        {
        case MIL1:
                switch (pixels_per_row)
                {
                        case 640:
                        case 768:
                        case 800:
                        case 960:
                        case 1024:
                        case 1152:
                        case 1280:
                        case 1600:
                        case 1920:
                        case 2048:
                                /* we are using hardware address linearisation */
                                break;
                        default:
                                /* we are using software address linearisation */
                                si->engine.y_lin = 0x01;
                                LOG(8,("ACC: using software adress linearisation\n"));
                                break;
                }
                /* bit 15 tells the engine which linearisation method is in use */
                ACCW(PITCH, (si->engine.y_lin << 15) | (pixels_per_row & 0x0FFF));
                break;
        case MIL2:
                switch (pixels_per_row)
                {
                        case 512:
                        case 640:
                        case 768:
                        case 800:
                        case 832:
                        case 960:
                        case 1024:
                        case 1152:
                        case 1280:
                        case 1600:
                        case 1664:
                        case 1920:
                        case 2048:
                                /* we are using hardware address linearisation */
                                break;
                        default:
                                /* we are using software address linearisation */
                                si->engine.y_lin = 0x01;
                                LOG(8,("ACC: using software adress linearisation\n"));
                                break;
                }
                /* bit 15 tells the engine which linearisation method is in use */
                ACCW(PITCH, (si->engine.y_lin << 15) | (pixels_per_row & 0x0FFF));
                break;
        case G100:
                /* always using hardware address linearisation, because 2D/3D
                 * engine works on every pitch multiple of 32 */
                ACCW(PITCH, (pixels_per_row & 0x0FFF));
                break;
        default:
                /* G200 and up are equal.. */
                /* always using hardware address linearisation, because 2D/3D
                 * engine works on every pitch multiple of 32 */
                ACCW(PITCH, (pixels_per_row & 0x1FFF));
                break;
        }

        /* disable plane write mask (needed for SDRAM): actual change needed to get it sent to RAM */
        ACCW(PLNWT,0x00000000);
        ACCW(PLNWT,0xffffffff);

        if (si->ps.card_type >= G200) {
                /*DSTORG - location of active screen in framebuffer*/
                ACCW(DSTORG,((uint8*)si->fbc.frame_buffer) - ((uint8*)si->framebuffer));

                /*SRCORG - init source address - same as dest*/
                ACCW(SRCORG,((uint8*)si->fbc.frame_buffer) - ((uint8*)si->framebuffer));
        }

        /* init YDSTORG - apsed, if not inited, BitBlts may fail on <= G200 */
        si->engine.src_dst = 0;
        ACCW(YDSTORG, si->engine.src_dst);

        /* <= G100 uses this register as SRCORG/DSTORG replacement, but
         * MIL 1/2 does not need framebuffer space for the hardcursor! */
        if ((si->ps.card_type == G100) && (si->settings.hardcursor))
        {
                /* 1024 bytes, expressed in pixels of the current depth */
                switch (si->dm.space)
                {
                        case B_CMAP8:
                                si->engine.src_dst = 1024 / 1;
                                break;
                        case B_RGB15_LITTLE:
                        case B_RGB16_LITTLE:
                                si->engine.src_dst = 1024 / 2;
                                break;
                        case B_RGB32_LITTLE:
                        case B_RGBA32_LITTLE:
                                /* both 32 bit spaces are accepted by the pixel width
                                 * setup above, so both have to be accepted here too */
                                si->engine.src_dst =  1024 / 4;
                                break;
                        default:
                                LOG(8,("ACC: G100 hardcursor not supported for current colorspace\n"));
                                return B_ERROR;
                }
        }
        ACCW(YDSTORG, si->engine.src_dst);

        /* clipping */
        /* i.e. highest and lowest X pixel addresses */
        ACCW(CXBNDRY,((pixels_per_row - 1) << 16) | (0));

        /* Y pixel addresses must be linear */
        /* lowest address */
        ACCW(YTOP, 0 + si->engine.src_dst);
        /* highest address */
        ACCW(YBOT,((si->dm.virtual_height - 1) * pixels_per_row) + si->engine.src_dst);

        return B_OK;
}


/*
        note:
        moved acceleration 'top-level' routines to be integrated in the engine:
        it is costly to call the engine for every single function within a loop!
        (measured with BeRoMeter 1.2.6: up to 15% speed increase on all CPUs.)
*/

/* screen to screen blit - i.e. move windows around.
 * Engine function bitblit, paragraph 4.5.7.2
 *
 * et:    engine token (not used by this function)
 * list:  the blits to execute
 * count: number of entries in list
 *
 * The copy direction is picked per blit by comparing the destination with
 * the source position: top to bottom or bottom to top, and left to right or
 * right to left. */
void SCREEN_TO_SCREEN_BLIT(engine_token *et, blit_params *list, uint32 count)
{
        /* distance between two lines, in pixels */
        uint32 pitch = (si->fbc.bytes_per_row / (si->engine.depth >> 3));
        uint32 n;

        for (n = 0; n < count; n++)
        {
                blit_params *r = &list[n];
                /* linear pixel address of the left edge of the top source line ... */
                uint32 top_row =
                        r->src_left + (pitch * r->src_top) + si->engine.src_dst;
                /* ... and of the bottom source line */
                uint32 bottom_row =
                        r->src_left + (pitch * (r->src_top + r->height)) + si->engine.src_dst;
                /* destination below the source: work from the bottom line upwards */
                int upwards = (r->dest_top > r->src_top);
                /* destination right of the source: work from right to left */
                int right_to_left = (r->dest_left > r->src_left);
                uint32 row, sgn, ar3, ar0, step;

                if (upwards)
                {
                        row = bottom_row;
                        sgn = 4;
                        step = -pitch;
                }
                else
                {
                        row = top_row;
                        sgn = 0;
                        step = pitch;
                }

                if (right_to_left)
                {
                        sgn |= 1;
                        ar3 = row + r->width;
                        ar0 = row;
                }
                else
                {
                        ar3 = row;
                        ar0 = row + r->width;
                }

                /* sgnzero bit _must_ be '0' before accessing SGN! */
                ACCW(DWGCTL, 0x00000000);

                ACCW(SGN, sgn);
                ACCW(AR3, ar3);
                ACCW(AR0, ar0);
                ACCW(AR5, step);
                if (upwards)
                {
                        ACCW_YDSTLEN(r->dest_top + r->height, r->height + 1);
                }
                else
                {
                        ACCW_YDSTLEN(r->dest_top, r->height + 1);
                }
                ACCW(FXBNDRY,((r->dest_left + r->width) << 16) | r->dest_left);

                /* start the blit */
                ACCGO(DWGCTL, 0x040c4018); // atype RSTR
        }
}

/* screen to screen transparent blit - not sure what uses this.
 * Engine function bitblit, paragraph 4.5.7.2
 *
 * WARNING: yet untested function!!
 *
 * et:                 engine token (not used by this function)
 * transparent_colour: value written to FCOL before each blit
 * list:               the blits to execute
 * count:              number of entries in list
 *
 * The register setup per blit equals that of SCREEN_TO_SCREEN_BLIT; on top of
 * that FCOL and BCOL are written, and the command word differs in bit 30
 * (presumably the transparency enable - confirm against the engine docs). */
void SCREEN_TO_SCREEN_TRANSPARENT_BLIT(engine_token *et, uint32 transparent_colour, blit_params *list, uint32 count)
{
        /* distance between two lines, in pixels */
        uint32 pitch = (si->fbc.bytes_per_row / (si->engine.depth >> 3));
        uint32 n;

        for (n = 0; n < count; n++)
        {
                blit_params *r = &list[n];
                /* linear pixel address of the left edge of the top source line ... */
                uint32 top_row =
                        r->src_left + (pitch * r->src_top) + si->engine.src_dst;
                /* ... and of the bottom source line */
                uint32 bottom_row =
                        r->src_left + (pitch * (r->src_top + r->height)) + si->engine.src_dst;
                /* destination below the source: work from the bottom line upwards */
                int upwards = (r->dest_top > r->src_top);
                /* destination right of the source: work from right to left */
                int right_to_left = (r->dest_left > r->src_left);
                uint32 row, sgn, ar3, ar0, step;

                if (upwards)
                {
                        row = bottom_row;
                        sgn = 4;
                        step = -pitch;
                }
                else
                {
                        row = top_row;
                        sgn = 0;
                        step = pitch;
                }

                if (right_to_left)
                {
                        sgn |= 1;
                        ar3 = row + r->width;
                        ar0 = row;
                }
                else
                {
                        ar3 = row;
                        ar0 = row + r->width;
                }

                /* sgnzero bit _must_ be '0' before accessing SGN! */
                ACCW(DWGCTL, 0x00000000);

                ACCW(SGN, sgn);
                ACCW(AR3, ar3);
                ACCW(AR0, ar0);
                ACCW(AR5, step);
                if (upwards)
                {
                        ACCW_YDSTLEN(r->dest_top + r->height, r->height + 1);
                }
                else
                {
                        ACCW_YDSTLEN(r->dest_top, r->height + 1);
                }
                ACCW(FXBNDRY,((r->dest_left + r->width) << 16) | r->dest_left);

                /* start the blit */
                ACCW(FCOL, transparent_colour);
                ACCW(BCOL, 0xffffffff);
                ACCGO(DWGCTL, 0x440c4018); // atype RSTR
        }
}

/* screen to screen scaled filtered blit - i.e. scale video in memory.
 * Engine function texture mapping for video, paragraphs 4.5.5.5 - 4.5.5.9
 *
 * fixme: implement...
 * For now this only walks the list: nothing is sent to the engine. */
void SCREEN_TO_SCREEN_SCALED_FILTERED_BLIT(engine_token *et, scaled_blit_params *list, uint32 count)
{
        uint32 n;

        for (n = 0; n < count; n++)
        {
                /* fields to be used per entry once this gets implemented:
                 *      list[n].src_left,  list[n].src_top,
                 *      list[n].src_width, list[n].src_height,
                 *      list[n].dest_left,  list[n].dest_top,
                 *      list[n].dest_width, list[n].dest_height
                 */
        }
}

/* rectangle fill.
 * Engine function rectangle_fill: paragraph 4.5.5.2
 *
 * et:         engine token (not used by this function)
 * colorIndex: value written to FCOL (foreground colour) for every rectangle
 * list:       the rectangles to fill
 * count:      number of entries in list
 *
 * Registers programmed per rectangle:
 *      FXBNDRY - left and right coordinates
 *      YDSTLEN - y start and no of lines (or YDST and LEN)
 *      FCOL    - foreground colour
 *      DWGCTL  - atype must be RSTR or BLK
 */
void FILL_RECTANGLE(engine_token *et, uint32 colorIndex, fill_rect_params *list, uint32 count)
{
        fill_rect_params *rect = list;
        uint32 todo;

        for (todo = count; todo != 0; todo--, rect++)
        {
                /* right edge + 1 in the upper 16 bits, left edge in the lower 16 bits */
                ACCW(FXBNDRY, (((rect->right + 1) << 16) | rect->left));
                ACCW_YDSTLEN(rect->top, ((rect->bottom - rect->top) + 1));
                ACCW(FCOL, colorIndex);

                /* start the fill.
                 * acc fixme: checkout blockmode constraints for G100+ (mil: nc?): also
                 * add blockmode for other functions, and use fastblt on MIL1/2 if
                 * possible...
                 * or is the CMAP8 constraint a non-blockmode constraint?
                 * (linearisation problem maybe?) */
                if ((si->dm.space != B_CMAP8) && !si->ps.sdram)
                {
                        ACCGO(DWGCTL, 0x400c7844); // atype BLK
                }
                else
                {
                        ACCGO(DWGCTL, 0x400c7814); // atype RSTR
                }
        }
}

/* horizontal span fill.
 * Engine function rectangle_fill: paragraph 4.5.5.2
 *
 * et:         engine token (not used by this function)
 * colorIndex: value written to FCOL (foreground colour) for every span
 * list:       three uint16 values per span: the first is used as Y start,
 *             the second as left and the third as right coordinate
 * count:      number of spans (so list holds 3 * count values)
 *
 * Registers programmed per span:
 *      FXBNDRY - left and right coordinates
 *      YDSTLEN - y start and no of lines (or YDST and LEN)
 *      FCOL    - foreground colour
 *      DWGCTL  - atype must be RSTR or BLK
 */
void FILL_SPAN(engine_token *et, uint32 colorIndex, uint16 *list, uint32 count)
{
        uint16 *span = list;
        uint32 n;

        for (n = 0; n < count; n++, span += 3)
        {
                /* right edge + 1 in the upper 16 bits, left edge in the lower 16 bits */
                ACCW(FXBNDRY, ((span[2] + 1) << 16)| span[1]);
                /* a span is always exactly one line high */
                ACCW_YDSTLEN(span[0], 1);
                ACCW(FCOL, colorIndex);

                /* start the fill.
                 * acc fixme: checkout blockmode constraints for G100+ (mil: nc?): also
                 * add blockmode for other functions, and use fastblt on MIL1/2 if
                 * possible...
                 * or is the CMAP8 constraint a non-blockmode constraint?
                 * (linearisation problem maybe?) */
                if ((si->dm.space != B_CMAP8) && !si->ps.sdram)
                {
                        ACCGO(DWGCTL, 0x400c7844); // atype BLK
                }
                else
                {
                        ACCGO(DWGCTL, 0x400c7814); // atype RSTR
                }
        }
}

/* rectangle invert.
 * Engine function rectangle_fill: paragraph 4.5.5.2
 *
 * et:    engine token (not used by this function)
 * list:  the rectangles to invert
 * count: number of entries in list
 *
 * Registers programmed per rectangle:
 *      FXBNDRY - left and right coordinates
 *      YDSTLEN - y start and no of lines (or YDST and LEN)
 *      FCOL    - foreground colour (always 0 here)
 *      DWGCTL  - atype must be RSTR or BLK
 */
void INVERT_RECTANGLE(engine_token *et, fill_rect_params *list, uint32 count)
{
        fill_rect_params *rect = list;
        uint32 todo;

        for (todo = count; todo != 0; todo--, rect++)
        {
                /* right edge + 1 in the upper 16 bits, left edge in the lower 16 bits */
                ACCW(FXBNDRY, (((rect->right) + 1) << 16) | rect->left);
                ACCW_YDSTLEN(rect->top, ((rect->bottom - rect->top) + 1));
                ACCW(FCOL, 0); /* color */

                /* start the invert (top nibble is 'c' if clipping is enabled,
                 * presumably - the original note is ambiguous) */
                ACCGO(DWGCTL, 0x40057814); // atype RSTR

                /* Notes on (currently unused) DMA versions of the above.
                 *
                 * Registers involved:
                 *      MGAACC_DWGCTL  = 0x1c00
                 *      MGAACC_FCOL    = 0x1c24
                 *      MGAACC_FXBNDRY = 0x1c84
                 *      MGAACC_YDSTLEN = 0x1c88
                 * which gives the indices 40,09,21,22 (ordered as registers).
                 */
                // Both versions used 'uint32 * dma;' and the real dma version
                // also 'uint32 pci;'.
                //
                // pseudo_dma version:
                //      dma = (uint32 *)si->pseudo_dma;
                // real dma version:
                //      dma = (vuint32 *)si->dma_buffer;
                // followed in both cases by:
                //      *dma++= 0x40092221; /* indices */
                //      *dma++= (((list[i].right) + 1) << 16) | list[i].left;
                //      *dma++= (list[i].top << 16) | ((list[i].bottom - list[i].top) + 1);
                //      *dma++= 0; /* color */
                //      *dma++= 0x40057814;
                // and for the real dma version only:
                //      pci = si->dma_buffer_pci;
                //      ACCW(PRIMADDRESS, (pci));
                //      ACCW(PRIMEND, (20 + pci));
                //      delay(100);
        }
}