root/src/add-ons/accelerants/nvidia/engine/nv_bes.c
/* Nvidia TNT and GeForce Back End Scaler functions */
/* Written by Rudolf Cornelissen 05/2002-5/2009 */

#define MODULE_BIT 0x00000200

#include "nv_std.h"

/* Values calculated for (re)positioning the overlay: the packed output window
 * edges plus the source start/clipping information that goes with them. */
typedef struct move_overlay_info
{
        /* output window: left edge in bits 16-26, right edge in bits 0-10 */
        uint32 hcoordv;
        /* output window: top edge in bits 16-26, bottom edge in bits 0-10 */
        uint32 vcoordv;
        /* horizontal source start in source buffer (clipping),
         * in 1/65536 pixel units */
        uint32 hsrcstv;
        /* vertical source start in source buffer (clipping),
         * in 1/65536 line units */
        uint32 v1srcstv;
        /* alternate source clipping via startadress of source buffer:
         * offset of the (clipped) input bitmap from the framebuffer start */
        uintptr_t a1orgv;
} move_overlay_info;

/* fills in *moi from the overlay and display mode state held in si */
static void nv_bes_calc_move_overlay(move_overlay_info *moi);
/* writes the values held in moi to the BES registers */
static void nv_bes_program_move_overlay(move_overlay_info moi);

/* Recalculate and reprogram the overlay output window, used when the visible
 * part of a virtual screen moves. Does nothing while no overlay is active. */
/* Note:
 * si->dm.h_display_start and si->dm.v_display_start determine where the new
 * output window is located! */
void nv_bes_move_overlay()
{
        /* only touch the hardware while an overlay is actually active */
        if (si->overlay.active)
        {
                move_overlay_info moi;

                nv_bes_calc_move_overlay(&moi);
                nv_bes_program_move_overlay(moi);
        }
}

/* Calculate the output window edges and the source clipping values for the
 * overlay.
 *
 * Input is the overlay window (si->overlay.ow), overlay buffer
 * (si->overlay.ob), checked zoom view (si->overlay.my_ov) and the inverse
 * scaling factors (si->overlay.h_ifactor, si->overlay.v_ifactor), together
 * with the display start and timing in si->dm.
 * On cards with a secondary head the overlay is first routed to a CRTC with
 * nv_bes_to_crtc(); the window and clipping registers themselves are not
 * programmed here.
 *
 * moi: receives the calculated values, every field is overwritten. */
static void nv_bes_calc_move_overlay(move_overlay_info *moi)
{
        /* output window edges relative to the visible CRTC area */
        uint16 left_edge, right_edge, top_edge, bottom_edge;
        /* visible screen window in virtual workspaces */
        uint16 crtc_hstart, crtc_vstart, crtc_hend, crtc_vend;

        /* do 'overlay follow head' in dualhead modes on dualhead cards */
        if (si->ps.secondary_head)
        {
                switch (si->dm.flags & DUALHEAD_BITS)
                {
                case DUALHEAD_ON:
                case DUALHEAD_SWITCH:
                        /* the horizontal centre of the window decides which head gets it */
                        if ((si->overlay.ow.h_start + (si->overlay.ow.width / 2)) <
                                        (si->dm.h_display_start + si->dm.timing.h_display))
                                nv_bes_to_crtc(si->crtc_switch_mode);
                        else
                                nv_bes_to_crtc(!si->crtc_switch_mode);
                        break;
                default:
                        nv_bes_to_crtc(si->crtc_switch_mode);
                        break;
                }
        }

        /* the BES does not respect virtual_workspaces, but adheres to CRTC
         * constraints only */
        crtc_hstart = si->dm.h_display_start;
        /* make dualhead stretch and switch mode work while we're at it.. */
        if (si->overlay.crtc)
        {
                crtc_hstart += si->dm.timing.h_display;
        }

        /* horizontal end is the first position beyond the displayed range on the CRTC */
        crtc_hend = crtc_hstart + si->dm.timing.h_display;
        crtc_vstart = si->dm.v_display_start;
        /* vertical end is the first position beyond the displayed range on the CRTC */
        crtc_vend = crtc_vstart + si->dm.timing.v_display;


        /****************************************
         *** setup all edges of output window ***
         ****************************************/

        /* left edge coordinate of output window, must be inside desktop */
        if (si->overlay.ow.h_start < crtc_hstart)
        {
                /* clipping on the left side */
                left_edge = 0;
        }
        else if (si->overlay.ow.h_start >= (crtc_hend - 1))
        {
                /* clipping on the right side: width < 2 is not allowed */
                left_edge = (crtc_hend - crtc_hstart - 2) & 0x7ff;
        }
        else
        {
                /* no clipping here */
                left_edge = (si->overlay.ow.h_start - crtc_hstart) & 0x7ff;
        }

        /* right edge coordinate of output window, must be inside desktop */
        if (si->overlay.ow.width < 2)
        {
                /* width < 2 is not allowed */
                right_edge = (left_edge + 1) & 0x7ff;
        }
        else if ((si->overlay.ow.h_start + si->overlay.ow.width - 1) > (crtc_hend - 1))
        {
                /* clipping on the right side */
                right_edge = (crtc_hend - crtc_hstart - 1) & 0x7ff;
        }
        else if ((si->overlay.ow.h_start + si->overlay.ow.width - 1) < (crtc_hstart + 1))
        {
                /* clipping on the left side: width < 2 is not allowed */
                right_edge = 1;
        }
        else
        {
                /* no clipping here */
                right_edge = ((uint16)(si->overlay.ow.h_start + si->overlay.ow.width - crtc_hstart - 1)) & 0x7ff;
        }
        /* left edge goes in the upper halfword, right edge in the lower one */
        moi->hcoordv = (((uint32)left_edge) << 16) | right_edge;
        LOG(4,("Overlay: CRTC left-edge output %d, right-edge output %d\n",left_edge, right_edge));

        /* top edge coordinate of output window, must be inside desktop */
        if (si->overlay.ow.v_start < crtc_vstart)
        {
                /* clipping on the top side */
                top_edge = 0;
        }
        else if (si->overlay.ow.v_start >= (crtc_vend - 1))
        {
                /* clipping on the bottom side: height < 2 is not allowed */
                top_edge = (crtc_vend - crtc_vstart - 2) & 0x7ff;
        }
        else
        {
                /* no clipping here */
                top_edge = (si->overlay.ow.v_start - crtc_vstart) & 0x7ff;
        }

        /* bottom edge coordinate of output window, must be inside desktop */
        if (si->overlay.ow.height < 2)
        {
                /* height < 2 is not allowed */
                bottom_edge = (top_edge + 1) & 0x7ff;
        }
        else if ((si->overlay.ow.v_start + si->overlay.ow.height - 1) > (crtc_vend - 1))
        {
                /* clipping on the bottom side */
                bottom_edge = (crtc_vend - crtc_vstart - 1) & 0x7ff;
        }
        else if ((si->overlay.ow.v_start + si->overlay.ow.height - 1) < (crtc_vstart + 1))
        {
                /* clipping on the top side: height < 2 is not allowed */
                bottom_edge = 1;
        }
        else
        {
                /* no clipping here */
                bottom_edge = ((uint16)(si->overlay.ow.v_start + si->overlay.ow.height - crtc_vstart - 1)) & 0x7ff;
        }
        /* top edge goes in the upper halfword, bottom edge in the lower one */
        moi->vcoordv = (((uint32)top_edge) << 16) | bottom_edge;
        LOG(4,("Overlay: CRTC top-edge output %d, bottom-edge output %d\n",top_edge, bottom_edge));


        /*********************************
         *** setup horizontal clipping ***
         *********************************/

        /* Setup horizontal source start: first (sub)pixel contributing to output picture */
        /* Note:
         * The method is to calculate, based on 1:1 scaling, based on the output window.
         * After this is done, include the scaling factor so you get a value based on the input bitmap.
         * Then add the left starting position of the bitmap's view (zoom function) to get the final value needed.
         * Note: The input bitmaps slopspace is automatically excluded from the calculations this way! */
        /* Note also:
         * Even if the scaling factor is clamping we instruct the BES to use the correct source start pos.! */
        moi->hsrcstv = 0;
        /* check for destination horizontal clipping at left side */
        if (si->overlay.ow.h_start < crtc_hstart)
        {
                /* check if entire destination picture is clipping left:
                 * (2 pixels will be clamped onscreen at least) */
                if ((si->overlay.ow.h_start + si->overlay.ow.width - 1) < (crtc_hstart + 1))
                {
                        /* increase 'first contributing pixel' with 'fixed value': (total dest. width - 2).
                         * A width below 2 contributes nothing: without this check the
                         * subtraction would go negative and wrap to a huge unsigned value. */
                        if (si->overlay.ow.width > 2)
                                moi->hsrcstv += (si->overlay.ow.width - 2);
                }
                else
                {
                        /* increase 'first contributing pixel' with actual number of dest. clipping pixels */
                        moi->hsrcstv += (crtc_hstart - si->overlay.ow.h_start);
                }
                LOG(4,("Overlay: clipping left...\n"));

                /* The calculated value is based on scaling = 1x. So we now compensate for scaling.
                 * Note that this also already takes care of aligning the value to the BES register! */
                moi->hsrcstv *= si->overlay.h_ifactor;
        }
        /* take zoom into account */
        moi->hsrcstv += ((uint32)si->overlay.my_ov.h_start) << 16;
        /* AND below required by hardware (> 1024 support confirmed on all cards) */
        moi->hsrcstv &= 0x07fffffc;
        LOG(4,("Overlay: first hor. (sub)pixel of input bitmap contributing %f\n", moi->hsrcstv / (float)65536));


        /*******************************
         *** setup vertical clipping ***
         *******************************/

        /* calculate inputbitmap origin adress (offset from the framebuffer start) */
        moi->a1orgv = (uintptr_t)((vuint32 *)si->overlay.ob.buffer);
        moi->a1orgv -= (uintptr_t)((vuint32 *)si->framebuffer);
        /* a1orgv is an uintptr_t: cast it so the argument matches the %x conversion */
        LOG(4, ("Overlay: topleft corner of input bitmap (cardRAM offset) $%08x\n", (unsigned int)moi->a1orgv));

        /* Setup vertical source start: first (sub)pixel contributing to output picture. */
        /* Note:
         * The method is to calculate, based on 1:1 scaling, based on the output window.
         * 'After' this is done, include the scaling factor so you get a value based on the input bitmap.
         * Then add the top starting position of the bitmap's view (zoom function) to get the final value needed. */
        /* Note also:
         * Even if the scaling factor is clamping we instruct the BES to use the correct source start pos.! */

        moi->v1srcstv = 0;
        /* check for destination vertical clipping at top side */
        if (si->overlay.ow.v_start < crtc_vstart)
        {
                /* check if entire destination picture is clipping at top:
                 * (2 pixels will be clamped onscreen at least) */
                if ((si->overlay.ow.v_start + si->overlay.ow.height - 1) < (crtc_vstart + 1))
                {
                        /* increase 'number of clipping pixels' with 'fixed value':
                         * 'total height - 2' of dest. picture in pixels * inverse scaling factor.
                         * A height below 2 contributes nothing: without this check the
                         * subtraction would go negative and wrap to a huge unsigned value. */
                        if (si->overlay.ow.height > 2)
                                moi->v1srcstv = (si->overlay.ow.height - 2) * si->overlay.v_ifactor;
                }
                else
                {
                        /* increase 'first contributing pixel' with:
                         * number of destination picture clipping pixels * inverse scaling factor */
                        moi->v1srcstv = (crtc_vstart - si->overlay.ow.v_start) * si->overlay.v_ifactor;
                }
                /* on pre-NV10 we need to do clipping in the source
                 * bitmap because no separate clipping registers exist... */
                if (si->ps.card_arch < NV10A)
                        moi->a1orgv += ((moi->v1srcstv >> 16) * si->overlay.ob.bytes_per_row);
                LOG(4,("Overlay: clipping at top...\n"));
        }
        /* take zoom into account */
        moi->v1srcstv += (((uint32)si->overlay.my_ov.v_start) << 16);
        if (si->ps.card_arch < NV10A)
        {
                moi->a1orgv += (si->overlay.my_ov.v_start * si->overlay.ob.bytes_per_row);
                LOG(4,("Overlay: 'contributing part of buffer' origin is (cardRAM offset) $%08x\n", (unsigned int)moi->a1orgv));
        }
        LOG(4,("Overlay: first vert. (sub)pixel of input bitmap contributing %f\n", moi->v1srcstv / (float)65536));

        /* AND below is probably required by hardware. */
        /* Buffer A topleft corner of field 1 (origin)(field 1 contains our full frames) */
        moi->a1orgv &= 0xfffffff0;
}

/* Program the output window position and size, plus the source clipping
 * belonging to it, into the BES registers.
 *
 * moi: values to program. It is passed by value; on pre-NV10 cards the local
 * copy of a1orgv gets the left clipping added before it is written. */
static void nv_bes_program_move_overlay(move_overlay_info moi)
{
        /* unpack the window edges: first edge in b16-31, last edge in b0-15 */
        const uint32 left = moi.hcoordv >> 16;
        const uint32 right = moi.hcoordv & 0x0000ffff;
        const uint32 top = moi.vcoordv >> 16;
        const uint32 bottom = moi.vcoordv & 0x0000ffff;
        /* output window position: top in the upper halfword, left in the lower */
        const uint32 dst_ref = (top << 16) | left;
        /* output window size: height in the upper halfword, width in the lower */
        const uint32 dst_size = ((bottom - top + 1) << 16) | (right - left + 1);

        /* No explicit sync to the BES (Back End Scaler) is done here, that is
         * done in card hardware:
         * double buffered registers + trigger if programming complete feature. */

        if (si->ps.card_arch >= NV10A)
        {
                /* setup buffer origin: GeForce uses subpixel precise clipping on left and top! (12.4 values) */
                BESW(NV10_0SRCREF, ((moi.v1srcstv << 4) & 0xffff0000) | ((moi.hsrcstv >> 12) & 0x0000ffff));
                /* setup output window position */
                BESW(NV10_0DSTREF, dst_ref);
                /* setup output window size */
                BESW(NV10_0DSTSIZE, dst_size);
                /* We only use buffer 0: select it. (0x01 = buffer 0, 0x10 = buffer 1) */
                /* This also triggers activation of programmed values (double buffered registers feature) */
                BESW(NV10_BUFSEL, 0x00000001);
                return;
        }

        /* pre-NV10 from here on */

        /* unknown, but needed (otherwise high-res distortions and only half the frames) */
        BESW(NV04_OE_STATE, 0x00000000);
        /* select buffer 0 as active (b16) */
        BESW(NV04_SU_STATE, 0x00000000);
        /* unknown (no effect?) */
        BESW(NV04_RM_STATE, 0x00000000);

        /* setup clipped(!) buffer startadress in RAM */
        /* RIVA128 - TNT bes doesn't have clipping registers, so no subpixelprecise clipping
         * either. We do pixelprecise vertical and 'two pixel' precise horizontal clipping here. */
        /* first include 'pixel precise' left clipping, at two bytes per pixel
         * (top clipping was already included) */
        moi.a1orgv += ((moi.hsrcstv >> 16) * 2);
        /* we need to step in 4-byte (2 pixel) granularity due to the nature of yuy2 */
        /* (program both buffers to prevent sync distortions) */
        BESW(NV04_0BUFADR, (moi.a1orgv & ~0x03));
        BESW(NV04_1BUFADR, (moi.a1orgv & ~0x03));

        /* setup output window position */
        BESW(NV04_DSTREF, dst_ref);
        /* setup output window size */
        BESW(NV04_DSTSIZE, dst_size);
        /* select buffer 1 as active (b16) */
        BESW(NV04_SU_STATE, 0x00010000);
}

/* Route the overlay engine to one of the two CRTCs.
 *
 * crtc: true selects CRTC2, false selects CRTC1.
 * Also records in si->overlay.crtc whether the selection differs from
 * si->crtc_switch_mode.
 * Returns B_OK after switching, or B_ERROR (nothing is touched) if the card
 * has no secondary head. */
status_t nv_bes_to_crtc(bool crtc)
{
        /* without a second head there is nothing to switch between */
        if (!si->ps.secondary_head)
                return B_ERROR;

        if (crtc)
        {
                LOG(4,("Overlay: switching overlay to CRTC2\n"));
                /* switch overlay engine to CRTC2: release it on the first
                 * function select register before claiming it on the second */
                NV_REG32(NV32_FUNCSEL) &= ~0x00001000;
                NV_REG32(NV32_2FUNCSEL) |= 0x00001000;
                si->overlay.crtc = !si->crtc_switch_mode;
        }
        else
        {
                LOG(4,("Overlay: switching overlay to CRTC1\n"));
                /* switch overlay engine to CRTC1: release it on the second
                 * function select register before claiming it on the first */
                NV_REG32(NV32_2FUNCSEL) &= ~0x00001000;
                NV_REG32(NV32_FUNCSEL) |= 0x00001000;
                si->overlay.crtc = si->crtc_switch_mode;
        }

        return B_OK;
}

/* Bring the BES into a known, neutral state: overlay interrupts off, neutral
 * color settings, and the engine disabled via nv_release_bes().
 * Always returns B_OK. */
status_t nv_bes_init()
{
        if (si->ps.card_arch >= NV10A)
        {
                /* disable overlay ints (b0 = buffer 0, b4 = buffer 1) */
                BESW(NV10_INTE, 0x00000000);
                /* shut off GeForce4MX MPEG2 decoder */
                BESW(DEC_GENCTRL, 0x00000000);
                /* setup BES memory-range mask */
                BESW(NV10_0MEMMASK, (si->ps.memory_size - 1));
                /* unknown, but needed */
                BESW(NV10_0OFFSET, 0x00000000);

                /* setup brightness, contrast and saturation to be 'neutral' */
                BESW(NV10_0BRICON, ((0x1000 << 16) | 0x1000));
                BESW(NV10_0SAT, ((0x0000 << 16) | 0x1000));
        }
        else
        {
                /* pre-NV10 */

                /* disable overlay ints (b0 = buffer 0, b4 = buffer 1) */
                BESW(NV04_INTE, 0x00000000);

                /* setup saturation to be 'neutral' */
                BESW(NV04_SAT, 0x00000000);
                /* setup RGB brightness to be 'neutral' */
                BESW(NV04_RED_AMP, 0x00000069);
                BESW(NV04_GRN_AMP, 0x0000003e);
                BESW(NV04_BLU_AMP, 0x00000089);

                /* setup fifo for fetching data */
                BESW(NV04_FIFOBURL, 0x00000003);
                BESW(NV04_FIFOTHRS, 0x00000038);

                /* unknown, but needed (registers only have b0 implemented) */
                /* (program both buffers to prevent sync distortions) */
                BESW(NV04_0OFFSET, 0x00000000);
                BESW(NV04_1OFFSET, 0x00000000);
        }

        /* make sure the engine is disabled. */
        nv_release_bes();

        return B_OK;
}

status_t nv_configure_bes
        (const overlay_buffer *ob, const overlay_window *ow, const overlay_view *ov, int offset)
{
        /* yuy2 (4:2:2) colorspace calculations */

        /* Note:
         * in BeOS R5.0.3 and DANO:
         * 'ow->offset_xxx' is always 0, so not used;
         * 'ow->width' and 'ow->height' are the output window size: does not change
         * if window is clipping;
         * 'ow->h_start' and 'ow->v_start' are the left-top position of the output
         * window. These values can be negative: this means the window is clipping
         * at the left or the top of the display, respectively. */

        /* 'ov' is the view in the source bitmap, so which part of the bitmap is actually
         * displayed on screen. This is used for the 'hardware zoom' function. */
 
        /* output window position and clipping info for source buffer */
        move_overlay_info moi;
        /* calculated BES register values */
        uint32  hiscalv, viscalv;
        /* interval representation, used for scaling calculations */
        uint16 intrep;
        /* inverse scaling factor, used for source positioning */
        uint32 ifactor;
        /* copy of overlay view which has checked valid values */
        overlay_view my_ov;


        /**************************************************************************************
         *** copy, check and limit if needed the user-specified view into the intput bitmap ***
         **************************************************************************************/
        my_ov = *ov;
        /* check for valid 'coordinates' */
        if (my_ov.width == 0) my_ov.width++;
        if (my_ov.height == 0) my_ov.height++;
        if (my_ov.h_start > ((ob->width - si->overlay.myBufInfo[offset].slopspace) - 1))
                my_ov.h_start = ((ob->width - si->overlay.myBufInfo[offset].slopspace) - 1);
        if (((my_ov.h_start + my_ov.width) - 1) > ((ob->width - si->overlay.myBufInfo[offset].slopspace) - 1))
                my_ov.width = ((((ob->width - si->overlay.myBufInfo[offset].slopspace) - 1) - my_ov.h_start) + 1);
        if (my_ov.v_start > (ob->height - 1))
                my_ov.v_start = (ob->height - 1);
        if (((my_ov.v_start + my_ov.height) - 1) > (ob->height - 1))
                my_ov.height = (((ob->height - 1) - my_ov.v_start) + 1);

        LOG(4,("Overlay: inputbuffer view (zoom) left %d, top %d, width %d, height %d\n",
                my_ov.h_start, my_ov.v_start, my_ov.width, my_ov.height));

        /* save for nv_bes_calc_move_overlay() */
        si->overlay.ow = *ow;
        si->overlay.ob = *ob;
        si->overlay.my_ov = my_ov;


        /********************************
         *** setup horizontal scaling ***
         ********************************/
        LOG(4,("Overlay: total input picture width = %d, height = %d\n",
                        (ob->width - si->overlay.myBufInfo[offset].slopspace), ob->height));
        LOG(4,("Overlay: output picture width = %d, height = %d\n", ow->width, ow->height));

        /* determine interval representation value, taking zoom into account */
        if (ow->flags & B_OVERLAY_HORIZONTAL_FILTERING)
        {
                /* horizontal filtering is ON */
                if ((my_ov.width == ow->width) | (ow->width < 2))
                {
                        /* no horizontal scaling used, OR destination width < 2 */
                        intrep = 0;
                }
                else
                {
                        intrep = 1;
                }
        }
        else
        {
                /* horizontal filtering is OFF */
                if ((ow->width < my_ov.width) & (ow->width >= 2))
                {
                        /* horizontal downscaling used AND destination width >= 2 */
                        intrep = 1;
                }
                else
                {
                        intrep = 0;
                }
        }
        LOG(4,("Overlay: horizontal interval representation value is %d\n",intrep));

        /* calculate inverse horizontal scaling factor, taking zoom into account */
        /* standard scaling formula: */
        ifactor = (((uint32)(my_ov.width - intrep)) << 16) / (ow->width - intrep); 

        /* correct factor to prevent most-right visible 'line' from distorting */
        ifactor -= (1 << 2);
        hiscalv = ifactor;
        /* save for nv_bes_calc_move_overlay() */
        si->overlay.h_ifactor = ifactor;
        LOG(4,("Overlay: horizontal scaling factor is %f\n", (float)65536 / ifactor));

        /* check scaling factor (and modify if needed) to be within scaling limits */
        /* all cards have a upscaling limit of 8.0 (see official nVidia specsheets) */
        if (hiscalv < 0x00002000)
        {
                /* (non-inverse) factor too large, set factor to max. valid value */
                hiscalv = 0x00002000;
                LOG(4,("Overlay: horizontal scaling factor too large, clamping at %f\n", (float)65536 / hiscalv));
        }
        switch (si->ps.card_arch)
        {
        case NV04A:
                /* Riva128-TNT2 series have a 'downscaling' limit of 1.000489
                 * (16bit register with 0.11 format value) */
                if (hiscalv > 0x0000ffff)
                {
                        /* (non-inverse) factor too small, set factor to min. valid value */
                        hiscalv = 0x0000ffff;
                        LOG(4,("Overlay: horizontal scaling factor too small, clamping at %f\n", (float)2048 / (hiscalv >> 5)));
                }
                break;
        case NV30A:
        case NV40A:
                /* GeForceFX series and up have a downscaling limit of 0.5 (except NV31!) */
                if ((hiscalv > (2 << 16)) && (si->ps.card_type != NV31))
                {
                        /* (non-inverse) factor too small, set factor to min. valid value */
                        hiscalv = (2 << 16);
                        LOG(4,("Overlay: horizontal scaling factor too small, clamping at %f\n", (float)65536 / hiscalv));
                }
                /* NV31 (confirmed GeForceFX 5600) has NV20A scaling limits!
                 * So let it fall through... */
                if (si->ps.card_type != NV31) break;
        default:
                /* the rest has a downscaling limit of 0.125 */
                if (hiscalv > (8 << 16))
                {
                        /* (non-inverse) factor too small, set factor to min. valid value */
                        hiscalv = (8 << 16);
                        LOG(4,("Overlay: horizontal scaling factor too small, clamping at %f\n", (float)65536 / hiscalv));
                }
                break;
        }
        /* AND below is required by hardware */
        hiscalv &= 0x001ffffc;


        /******************************
         *** setup vertical scaling ***
         ******************************/

        /* determine interval representation value, taking zoom into account */
        if (ow->flags & B_OVERLAY_VERTICAL_FILTERING)
        {
                /* vertical filtering is ON */
                if ((my_ov.height == ow->height) | (ow->height < 2))
                {
                        /* no vertical scaling used, OR destination height < 2 */
                        intrep = 0;
                }
                else
                {
                        intrep = 1;
                }
        }
        else
        {
                /* vertical filtering is OFF */
                if ((ow->height < my_ov.height) & (ow->height >= 2))
                {
                        /* vertical downscaling used AND destination height >= 2 */
                        intrep = 1;
                }
                else
                {
                        intrep = 0;
                }
        }
        LOG(4,("Overlay: vertical interval representation value is %d\n",intrep));

        /* calculate inverse vertical scaling factor, taking zoom into account */
        /* standard scaling formula: */
        ifactor = (((uint32)(my_ov.height - intrep)) << 16) / (ow->height - intrep); 

        /* correct factor to prevent lowest visible line from distorting */
        ifactor -= (1 << 2);
        LOG(4,("Overlay: vertical scaling factor is %f\n", (float)65536 / ifactor));

        /* preserve ifactor for source positioning calculations later on */
        viscalv = ifactor;
        /* save for nv_bes_calc_move_overlay() */
        si->overlay.v_ifactor = ifactor;

        /* check scaling factor (and modify if needed) to be within scaling limits */
        /* all cards have a upscaling limit of 8.0 (see official nVidia specsheets) */
        if (viscalv < 0x00002000)
        {
                /* (non-inverse) factor too large, set factor to max. valid value */
                viscalv = 0x00002000;
                LOG(4,("Overlay: vertical scaling factor too large, clamping at %f\n", (float)65536 / viscalv));
        }
        switch (si->ps.card_arch)
        {
        case NV04A:
                /* Riva128-TNT2 series have a 'downscaling' limit of 1.000489
                 * (16bit register with 0.11 format value) */
                if (viscalv > 0x0000ffff)
                {
                        /* (non-inverse) factor too small, set factor to min. valid value */
                        viscalv = 0x0000ffff;
                        LOG(4,("Overlay: vertical scaling factor too small, clamping at %f\n", (float)2048 / (viscalv >> 5)));
                }
                break;
        case NV30A:
        case NV40A:
                /* GeForceFX series and up have a downscaling limit of 0.5 (except NV31!) */
                if ((viscalv > (2 << 16)) && (si->ps.card_type != NV31))
                {
                        /* (non-inverse) factor too small, set factor to min. valid value */
                        viscalv = (2 << 16);
                        LOG(4,("Overlay: vertical scaling factor too small, clamping at %f\n", (float)65536 / viscalv));
                }
                /* NV31 (confirmed GeForceFX 5600) has NV20A scaling limits!
                 * So let it fall through... */
                if (si->ps.card_type != NV31) break;
        default:
                /* the rest has a downscaling limit of 0.125 */
                if (viscalv > (8 << 16))
                {
                        /* (non-inverse) factor too small, set factor to min. valid value */
                        viscalv = (8 << 16);
                        LOG(4,("Overlay: vertical scaling factor too small, clamping at %f\n", (float)65536 / viscalv));
                }
                break;
        }
        /* AND below is required by hardware */
        viscalv &= 0x001ffffc;


        /********************************************************************************
         *** setup all edges of output window, setup horizontal and vertical clipping ***
         ********************************************************************************/
        nv_bes_calc_move_overlay(&moi);


        /*****************************
         *** log color keying info ***
         *****************************/

        LOG(4,("Overlay: key_red %d, key_green %d, key_blue %d, key_alpha %d\n",
                ow->red.value, ow->green.value, ow->blue.value, ow->alpha.value));
        LOG(4,("Overlay: mask_red %d, mask_green %d, mask_blue %d, mask_alpha %d\n",
                ow->red.mask, ow->green.mask, ow->blue.mask, ow->alpha.mask));


        /*****************
         *** log flags ***
         *****************/

        LOG(4,("Overlay: ow->flags is $%08x\n",ow->flags));
        /* BTW: horizontal and vertical filtering are fixed and turned on for GeForce overlay. */


        /*************************************
         *** sync to BES (Back End Scaler) ***
         *************************************/

        /* Done in card hardware:
         * double buffered registers + trigger if programming complete feature. */


        /**************************************
         *** actually program the registers ***
         **************************************/

        if (si->ps.card_arch < NV10A)
        {
                /* unknown, but needed (otherwise high-res distortions and only half the frames */
                BESW(NV04_OE_STATE, 0x00000000);
                /* select buffer 0 as active (b16) */
                BESW(NV04_SU_STATE, 0x00000000);
                /* unknown (no effect?) */
                BESW(NV04_RM_STATE, 0x00000000);
                /* setup clipped(!) buffer startadress in RAM */
                /* RIVA128 - TNT bes doesn't have clipping registers, so no subpixelprecise clipping
                 * either. We do pixelprecise vertical and 'two pixel' precise horizontal clipping here. */
                /* (program both buffers to prevent sync distortions) */
                /* first include 'pixel precise' left clipping... (top clipping was already included) */
                moi.a1orgv += ((moi.hsrcstv >> 16) * 2);
                /* we need to step in 4-byte (2 pixel) granularity due to the nature of yuy2 */
                BESW(NV04_0BUFADR, (moi.a1orgv & ~0x03));
                BESW(NV04_1BUFADR, (moi.a1orgv & ~0x03));
                /* setup buffer source pitch including slopspace (in bytes).
                 * Note:
                 * source pitch granularity = 16 pixels on the RIVA128 - TNT (so pre-NV10) bes */
                /* (program both buffers to prevent sync distortions) */
                BESW(NV04_0SRCPTCH, (ob->width * 2));
                BESW(NV04_1SRCPTCH, (ob->width * 2));
                /* setup output window position */
                BESW(NV04_DSTREF, ((moi.vcoordv & 0xffff0000) | ((moi.hcoordv & 0xffff0000) >> 16)));
                /* setup output window size */
                BESW(NV04_DSTSIZE, (
                        (((moi.vcoordv & 0x0000ffff) - ((moi.vcoordv & 0xffff0000) >> 16) + 1) << 16) |
                        ((moi.hcoordv & 0x0000ffff) - ((moi.hcoordv & 0xffff0000) >> 16) + 1)
                        ));
                /* setup horizontal and vertical scaling */
                BESW(NV04_ISCALVH, (((viscalv << 16) >> 5) | (hiscalv >> 5)));
                /* enable vertical filtering (b0) */
                BESW(NV04_CTRL_V, 0x00000001);
                /* enable horizontal filtering (no effect?) */
                BESW(NV04_CTRL_H, 0x00000111);
                /* enable BES (b0), set colorkeying (b4), format yuy2 (b8: 0 = ccir) */
                if (ow->flags & B_OVERLAY_COLOR_KEY)
                        BESW(NV04_GENCTRL, 0x00000111);
                else
                        BESW(NV04_GENCTRL, 0x00000101);
                /* select buffer 1 as active (b16) */
                BESW(NV04_SU_STATE, 0x00010000);

                /**************************
                 *** setup color keying ***
                 **************************/

                /* setup colorkeying */
                switch(si->dm.space)
                {
                case B_RGB15_LITTLE:
                        BESW(NV04_COLKEY, (
                                ((ow->blue.value & ow->blue.mask) << 0)   |
                                ((ow->green.value & ow->green.mask) << 5) |
                                ((ow->red.value & ow->red.mask) << 10)    |
                                ((ow->alpha.value & ow->alpha.mask) << 15)
                                ));
                        break;
                case B_RGB16_LITTLE:
                        BESW(NV04_COLKEY, (
                                ((ow->blue.value & ow->blue.mask) << 0)   |
                                ((ow->green.value & ow->green.mask) << 5) |
                                ((ow->red.value & ow->red.mask) << 11)
                                /* this space has no alpha bits */
                                ));
                        break;
                case B_CMAP8:
                case B_RGB32_LITTLE:
                default:
                        BESW(NV04_COLKEY, (
                                ((ow->blue.value & ow->blue.mask) << 0)   |
                                ((ow->green.value & ow->green.mask) << 8) |
                                ((ow->red.value & ow->red.mask) << 16)    |
                                ((ow->alpha.value & ow->alpha.mask) << 24)
                                ));
                        break;
                }
        }
        else
        {
                /* >= NV10A */
        
                /* setup buffer origin: GeForce uses subpixel precise clipping on left and top! (12.4 values) */
                BESW(NV10_0SRCREF, ((moi.v1srcstv << 4) & 0xffff0000) | ((moi.hsrcstv >> 12) & 0x0000ffff));
                /* setup buffersize */
                //fixme if needed: width must be even officially...
                BESW(NV10_0SRCSIZE, ((ob->height << 16) | ob->width));
                /* setup source pitch including slopspace (in bytes),
                 * b16: select YUY2 (0 = YV12), b20: set colorkeying, b24: no iturbt_709 (do iturbt_601) */
                /* Note:
                 * source pitch granularity = 32 pixels on GeForce cards!! */
                if (ow->flags & B_OVERLAY_COLOR_KEY)
                        BESW(NV10_0SRCPTCH, (((ob->width * 2) & 0x0000ffff) | (1 << 16) | (1 << 20) | (0 << 24)));
                else
                        BESW(NV10_0SRCPTCH, (((ob->width * 2) & 0x0000ffff) | (1 << 16) | (0 << 20) | (0 << 24)));
                /* setup output window position */
                BESW(NV10_0DSTREF, ((moi.vcoordv & 0xffff0000) | ((moi.hcoordv & 0xffff0000) >> 16)));
                /* setup output window size */
                BESW(NV10_0DSTSIZE, (
                        (((moi.vcoordv & 0x0000ffff) - ((moi.vcoordv & 0xffff0000) >> 16) + 1) << 16) |
                        ((moi.hcoordv & 0x0000ffff) - ((moi.hcoordv & 0xffff0000) >> 16) + 1)
                        ));
                /* setup horizontal scaling */
                BESW(NV10_0ISCALH, (hiscalv << 4));
                /* setup vertical scaling */
                BESW(NV10_0ISCALV, (viscalv << 4));
                /* setup (unclipped!) buffer startadress in RAM */
                BESW(NV10_0BUFADR, moi.a1orgv);
                /* enable BES (b0 = 0) */
                BESW(NV10_GENCTRL, 0x00000000);
                /* We only use buffer 0: select it. (0x01 = buffer 0, 0x10 = buffer 1) */
                /* This also triggers activation of programmed values (double buffered registers feature) */
                BESW(NV10_BUFSEL, 0x00000001);

                /**************************
                 *** setup color keying ***
                 **************************/

                /* setup colorkeying */
                switch(si->dm.space)
                {
                case B_RGB15_LITTLE:
                        BESW(NV10_COLKEY, (
                                ((ow->blue.value & ow->blue.mask) << 0)   |
                                ((ow->green.value & ow->green.mask) << 5) |
                                ((ow->red.value & ow->red.mask) << 10)    |
                                ((ow->alpha.value & ow->alpha.mask) << 15)
                                ));
                        break;
                case B_RGB16_LITTLE:
                        BESW(NV10_COLKEY, (
                                ((ow->blue.value & ow->blue.mask) << 0)   |
                                ((ow->green.value & ow->green.mask) << 5) |
                                ((ow->red.value & ow->red.mask) << 11)
                                /* this space has no alpha bits */
                                ));
                        break;
                case B_CMAP8:
                case B_RGB32_LITTLE:
                default:
                        BESW(NV10_COLKEY, (
                                ((ow->blue.value & ow->blue.mask) << 0)   |
                                ((ow->green.value & ow->green.mask) << 8) |
                                ((ow->red.value & ow->red.mask) << 16)    |
                                ((ow->alpha.value & ow->alpha.mask) << 24)
                                ));
                        break;
                }
        }

        /* note that overlay is in use (for nv_bes_move_overlay()) */
        si->overlay.active = true;

        return B_OK;
}

/* Switch off the Back End Scaler (hardware overlay) and flag the overlay as
 * no longer in use. Always returns B_OK. */
status_t nv_release_bes()
{
        if (si->ps.card_arch >= NV10A)
        {
                /* NV10A and later: BES control uses b0 = 0 for 'enabled',
                 * so writing b0 = 1 disables the scaler */
                BESW(NV10_GENCTRL, 0x00000001);
        }
        else
        {
                /* pre-NV10A: BES control uses b0 = 1 for 'enabled',
                 * so clearing b0 disables the scaler */
                BESW(NV04_GENCTRL, 0x00000000);
        }

        /* nv_bes_move_overlay() checks this flag and aborts while the overlay is inactive */
        si->overlay.active = false;

        return B_OK;
}