/* root/src/add-ons/accelerants/neomagic/engine/nm_bes.c */
/* NeoMagic Back End Scaler functions */
/* Written by Rudolf Cornelissen 05/2002-1/2006 */

#define MODULE_BIT 0x00000200

#include "nm_std.h"

/* Forward declarations of the file-local helpers shared by
 * nm_bes_move_overlay() and nm_configure_bes(). */
/* calculates output window edges, source clipping and inputbuffer origin
 * and stores them in *moi */
static void nm_bes_calc_move_overlay(move_overlay_info *moi);
/* programs the values in moi into the BES: directly on PCI cards, via the
 * kerneldriver on ISA cards */
static void nm_bes_program_move_overlay(move_overlay_info moi);

/* Reposition the overlay output window when moving around in virtualscreens */
/* Note:
 * the location of the new output window follows from
 * si->dm.h_display_start and si->dm.v_display_start! */
void nm_bes_move_overlay()
{
        move_overlay_info moi;

        /* only recalculate and reprogram while the overlay is actually in use */
        if (si->overlay.active)
        {
                nm_bes_calc_move_overlay(&moi);
                nm_bes_program_move_overlay(moi);
        }
}

/* Calculate all BES values that depend on where the overlay output window
 * sits relative to the part of the screen shown by the CRTC, and store them
 * in *moi:
 * - hcoordv: left edge (bits 16 and up) and right edge (low bits) of the
 *   output window;
 * - vcoordv: top edge (bits 16 and up) and bottom edge (low bits) of the
 *   output window;
 * - hsrcendv: last horizontal (sub)pixel of the input bitmap contributing to
 *   the output picture;
 * - a1orgv: cardRAM offset of the first contributing pixel of the input bitmap.
 * Input is taken from si->dm (current mode) and from the window, buffer, view
 * and inverse scaling factors kept in si->overlay.
 * Note:
 * For output windows smaller than 2 pixels the 'total size - 2' clipping terms
 * are clamped at zero, so they can never wrap around to huge unsigned values. */
static void nm_bes_calc_move_overlay(move_overlay_info *moi)
{
        /* misc used variables */
        uint16 temp1, temp2;
        /* visible screen window in virtual workspaces */
        uint16 crtc_hstart, crtc_vstart, crtc_hend, crtc_vend;
        /* horizontal source start in source buffer (clipping) */
        uint32 hsrcstv;
        /* 'fixed value' number of clipping pixels used when the entire output window
         * is offscreen: total size minus the 2 pixels that are clamped onscreen */
        uint16 h_fixed_clip, v_fixed_clip;

        /* the BES does not respect virtual_workspaces, but adheres to CRTC
         * constraints only */
        crtc_hstart = si->dm.h_display_start;
        /* horizontal end is the first position beyond the displayed range on the CRTC */
        crtc_hend = crtc_hstart + si->dm.timing.h_display;
        crtc_vstart = si->dm.v_display_start;
        /* vertical end is the first position beyond the displayed range on the CRTC */
        crtc_vend = crtc_vstart + si->dm.timing.v_display;

        /* width and height are unsigned: without the checks below a size < 2 would
         * turn 'size - 2' into a negative number that wraps once it's used unsigned */
        h_fixed_clip = (si->overlay.ow.width > 2) ? (si->overlay.ow.width - 2) : 0;
        v_fixed_clip = (si->overlay.ow.height > 2) ? (si->overlay.ow.height - 2) : 0;


        /****************************************
         *** setup all edges of output window ***
         ****************************************/

        /* setup left and right edges of output window */
        moi->hcoordv = 0;
        /* left edge coordinate of output window, must be inside desktop */
        /* clipping on the left side */
        if (si->overlay.ow.h_start < crtc_hstart)
        {
                temp1 = 0;
        }
        else
        {
                /* clipping on the right side */
                if (si->overlay.ow.h_start >= (crtc_hend - 1))
                {
                        /* width < 2 is not allowed */
                        temp1 = (crtc_hend - crtc_hstart - 2);
                }
                else
                /* no clipping here */
                {
                        temp1 = (si->overlay.ow.h_start - crtc_hstart);
                }
        }
        /* shift as unsigned 32-bit value: shifting the promoted (signed) int is
         * undefined once bit 15 of the edge is set */
        moi->hcoordv |= ((uint32)temp1) << 16;

        /* right edge coordinate of output window, must be inside desktop */
        /* width < 2 is not allowed */
        if (si->overlay.ow.width < 2)
        {
                temp2 = (temp1 + 1);
        }
        else
        {
                /* clipping on the right side */
                if ((si->overlay.ow.h_start + si->overlay.ow.width - 1) > (crtc_hend - 1))
                {
                        temp2 = (crtc_hend - crtc_hstart - 1);
                }
                else
                {
                        /* clipping on the left side */
                        if ((si->overlay.ow.h_start + si->overlay.ow.width - 1) < (crtc_hstart + 1))
                        {
                                /* width < 2 is not allowed */
                                temp2 = 1;
                        }
                        else
                        /* no clipping here */
                        {
                                temp2 = ((uint16)(si->overlay.ow.h_start + si->overlay.ow.width - crtc_hstart - 1));
                        }
                }
        }
        moi->hcoordv |= temp2 << 0;
        LOG(4,("Overlay: CRTC left-edge output %d, right-edge output %d\n",temp1, temp2));

        /* setup top and bottom edges of output window */
        moi->vcoordv = 0;
        /* top edge coordinate of output window, must be inside desktop */
        /* clipping on the top side */
        if (si->overlay.ow.v_start < crtc_vstart)
        {
                temp1 = 0;
        }
        else
        {
                /* clipping on the bottom side */
                if (si->overlay.ow.v_start >= (crtc_vend - 1))
                {
                        /* height < 2 is not allowed */
                        temp1 = (crtc_vend - crtc_vstart - 2);
                }
                else
                /* no clipping here */
                {
                        temp1 = (si->overlay.ow.v_start - crtc_vstart);
                }
        }
        /* see remark for the left edge regarding the cast */
        moi->vcoordv |= ((uint32)temp1) << 16;

        /* bottom edge coordinate of output window, must be inside desktop */
        /* height < 2 is not allowed */
        if (si->overlay.ow.height < 2)
        {
                temp2 = (temp1 + 1);
        }
        else
        {
                /* clipping on the bottom side */
                if ((si->overlay.ow.v_start + si->overlay.ow.height - 1) > (crtc_vend - 1))
                {
                        temp2 = (crtc_vend - crtc_vstart - 1);
                }
                else
                {
                        /* clipping on the top side */
                        if ((si->overlay.ow.v_start + si->overlay.ow.height - 1) < (crtc_vstart + 1))
                        {
                                /* height < 2 is not allowed */
                                temp2 = 1;
                        }
                        else
                        /* no clipping here */
                        {
                                temp2 = ((uint16)(si->overlay.ow.v_start + si->overlay.ow.height - crtc_vstart - 1));
                        }
                }
        }
        moi->vcoordv |= temp2 << 0;
        LOG(4,("Overlay: CRTC top-edge output %d, bottom-edge output %d\n",temp1, temp2));


        /*********************************
         *** setup horizontal clipping ***
         *********************************/

        /* Setup horizontal source start: first (sub)pixel contributing to output picture */
        /* Note:
         * The method is to calculate, based on 1:1 scaling, based on the output window.
         * After this is done, include the scaling factor so you get a value based on the input bitmap.
         * Then add the left starting position of the bitmap's view (zoom function) to get the final value needed.
         * Note: The input bitmaps slopspace is automatically excluded from the calculations this way! */
        /* Note also:
         * Even if the scaling factor is clamping we instruct the BES to use the correct source start pos.! */
        hsrcstv = 0;
        /* check for destination horizontal clipping at left side */
        if (si->overlay.ow.h_start < crtc_hstart)
        {
                /* check if entire destination picture is clipping left:
                 * (2 pixels will be clamped onscreen at least) */
                if ((si->overlay.ow.h_start + si->overlay.ow.width - 1) < (crtc_hstart + 1))
                {
                        /* increase 'first contributing pixel' with 'fixed value': (total dest. width - 2),
                         * clamped at zero for windows narrower than 2 pixels */
                        hsrcstv += h_fixed_clip;
                }
                else
                {
                        /* increase 'first contributing pixel' with actual number of dest. clipping pixels */
                        hsrcstv += (crtc_hstart - si->overlay.ow.h_start);
                }
                LOG(4,("Overlay: clipping left...\n"));

                /* The calculated value is based on scaling = 1x. So we now compensate for scaling.
                 * Note that this also already takes care of aligning the value to the BES register! */
                hsrcstv *= (si->overlay.h_ifactor << 4);
        }
        /* take zoom into account */
        hsrcstv += ((uint32)si->overlay.my_ov.h_start) << 16;
        LOG(4,("Overlay: first hor. (sub)pixel of input bitmap contributing %f\n", hsrcstv / (float)65536));

        /* Setup horizontal source end: last (sub)pixel contributing to output picture */
        /* Note:
         * The method is to calculate, based on 1:1 scaling, based on the output window.
         * After this is done, include the scaling factor so you get a value based on the input bitmap.
         * Then add the right ending position of the bitmap's view (zoom function) to get the final value needed. */
        /* Note also:
         * Even if the scaling factor is clamping we instruct the BES to use the correct source end pos.! */

        moi->hsrcendv = 0;
        /* check for destination horizontal clipping at right side */
        if ((si->overlay.ow.h_start + si->overlay.ow.width - 1) > (crtc_hend - 1))
        {
                /* check if entire destination picture is clipping right:
                 * (2 pixels will be clamped onscreen at least) */
                if (si->overlay.ow.h_start > (crtc_hend - 2))
                {
                        /* increase 'number of clipping pixels' with 'fixed value': (total dest. width - 2),
                         * clamped at zero for windows narrower than 2 pixels */
                        moi->hsrcendv += h_fixed_clip;
                }
                else
                {
                        /* increase 'number of clipping pixels' with actual number of dest. clipping pixels */
                        moi->hsrcendv += ((si->overlay.ow.h_start + si->overlay.ow.width - 1) - (crtc_hend - 1));
                }
                LOG(4,("Overlay: clipping right...\n"));

                /* The calculated value is based on scaling = 1x. So we now compensate for scaling.
                 * Note that this also already takes care of aligning the value to the BES register! */
                moi->hsrcendv *= (si->overlay.h_ifactor << 4);
                /* now subtract this value from the last used pixel in (zoomed) inputbuffer, aligned to BES */
                moi->hsrcendv = (((uint32)((si->overlay.my_ov.h_start + si->overlay.my_ov.width) - 1)) << 16) - moi->hsrcendv;
        }
        else
        {
                /* set last contributing pixel to last used pixel in (zoomed) inputbuffer, aligned to BES */
                moi->hsrcendv = (((uint32)((si->overlay.my_ov.h_start + si->overlay.my_ov.width) - 1)) << 16);
        }
        /* AND below required by hardware */
        moi->hsrcendv &= 0x03ffffff;
        LOG(4,("Overlay: last horizontal (sub)pixel of input bitmap contributing %f\n", moi->hsrcendv / (float)65536));


        /*******************************
         *** setup vertical clipping ***
         *******************************/

        /* calculate inputbitmap origin address: offset of the buffer within the framebuffer */
        moi->a1orgv = (uintptr_t)((vuint32 *)si->overlay.ob.buffer);
        moi->a1orgv -= (uintptr_t)((vuint32 *)si->framebuffer);
        LOG(4,("Overlay: topleft corner of input bitmap (cardRAM offset) $%08x\n", moi->a1orgv));

        /* Setup vertical source start: first (sub)pixel contributing to output picture. */
        /* Note:
         * The method is to calculate, based on 1:1 scaling, based on the output window.
         * 'After' this is done, include the scaling factor so you get a value based on the input bitmap.
         * Then add the top starting position of the bitmap's view (zoom function) to get the final value needed. */
        /* Note also:
         * Even if the scaling factor is clamping we instruct the BES to use the correct source start pos.! */

        /* check for destination vertical clipping at top side */
        if (si->overlay.ow.v_start < crtc_vstart)
        {
                /* check if entire destination picture is clipping at top:
                 * (2 pixels will be clamped onscreen at least) */
                if ((si->overlay.ow.v_start + si->overlay.ow.height - 1) < (crtc_vstart + 1))
                {
                        /* increase source buffer origin with 'fixed value':
                         * (integer part of ('total height - 2' of dest. picture in pixels * inverse scaling factor)) *
                         * bytes per row source picture
                         * ('total height - 2' is clamped at zero for windows lower than 2 pixels) */
                        //fixme: rounding down would be better than just chopping off the fractional part...
                        moi->a1orgv +=
                                (((v_fixed_clip * (si->overlay.v_ifactor << 4)) >> 16) *
                                si->overlay.ob.bytes_per_row);
                }
                else
                {
                        /* increase source buffer origin with:
                         * (integer part of (number of destination picture clipping pixels * inverse scaling factor)) *
                         * bytes per row source picture */
                        //fixme: rounding down would be better than just chopping off the fractional part...
                        moi->a1orgv +=
                                ((((crtc_vstart - si->overlay.ow.v_start) * (si->overlay.v_ifactor << 4)) >> 16) *
                                si->overlay.ob.bytes_per_row);
                }
                LOG(4,("Overlay: clipping at top...\n"));
        }
        /* take zoom into account */
        moi->a1orgv += (si->overlay.my_ov.v_start * si->overlay.ob.bytes_per_row);
        /* now include 'pixel precise' left clipping...
         * (subpixel precision is not supported by NeoMagic cards) */
        moi->a1orgv += ((hsrcstv >> 16) * 2);
        /* we need to step in 4-byte (2 pixel) granularity due to the nature of yuy2 */
        moi->a1orgv &= ~0x03;
        LOG(4,("Overlay: 'contributing part of buffer' origin is (cardRAM offset) $%08x\n", moi->a1orgv));
}

/* Write the values prepared in moi (output window edges, horizontal source
 * end and inputbuffer origin) to the BES.
 * PCI cards (NM2097 and later types) are programmed right here by the
 * accelerant; for ISA cards the whole sequence is handed to the kerneldriver. */
static void nm_bes_program_move_overlay(move_overlay_info moi)
{
        /* sync to BES (Back End Scaler):
         * reprogramming the BES should complete before the next retrace occurs,
         * to prevent register-update glitches (double buffer feature). */

        //fixme if needed...

        if (si->ps.card_type < NM2097)
        {
                /* bes setup data */
                nm_bes_data bi;

                /* ISA card. Speed required, so:
                 * program entire sequence in kerneldriver in one context switch! */
                LOG(4,("Overlay: kerneldriver programs BES\n"));

                /* fill in the part of the BES info struct used for moving only... */
                bi.moi = moi;
                bi.card_type = si->ps.card_type;
                bi.move_only = true;
                /* ... and let the kerneldriver program the BES */
                bi.magic = NM_PRIVATE_DATA_MAGIC;
                ioctl(fd, NM_PGM_BES, &bi, sizeof(bi));
                return;
        }

        /* PCI card: actually program the registers */
        LOG(4,("Overlay: accelerant is programming BES\n"));
        /* unlock card overlay sequencer registers (b5 = 1) */
        PCIGRPHW(GENLOCK, (PCIGRPHR(GENLOCK) | 0x20));

        /* destination rectangle #1 (output window position and size):
         * low bytes of both edges first, then the two high nibbles combined */
        PCIGRPHW(HD1COORD1L, ((moi.hcoordv >> 16) & 0xff));
        PCIGRPHW(HD1COORD2L, (moi.hcoordv & 0xff));
        PCIGRPHW(HD1COORD21H, (((moi.hcoordv >> 4) & 0xf0) | ((moi.hcoordv >> 24) & 0x0f)));
        PCIGRPHW(VD1COORD1L, ((moi.vcoordv >> 16) & 0xff));
        PCIGRPHW(VD1COORD2L, (moi.vcoordv & 0xff));
        PCIGRPHW(VD1COORD21H, (((moi.vcoordv >> 4) & 0xf0) | ((moi.vcoordv >> 24) & 0x0f)));

        /* horizontal source end and inputbuffer #1 origin */
        /* (we don't program buffer #2 as it's unused.) */
        if (si->ps.card_type >= NM2200)
        {
                /* horizontal source end does not use subpixelprecision: granularity is 16 pixels */
                /* notes:
                 * - programming this register just a tiny bit too low messes up vertical
                 *   scaling badly (also distortion stripes and flickering are reported)!
                 * - not programming this register correctly will mess-up the picture when
                 *   it's partly clipping on the right side of the screen...
                 * - make absolutely sure the engine can fetch the last pixel needed from
                 *   the sourcebitmap even if only to generate a tiny subpixel from it!
                 * - adding 17 below follows the same recipe as adding 9 for the
                 *   cards before NM2200, but for 16-pixel granularity. */
                PCIGRPHW(0xbc, ((((moi.hsrcendv >> 16) + 17) >> 4) - 1));
        }
        else
        {
                /* cards before NM2200 get the origin value halved */
                moi.a1orgv >>= 1;
                /* horizontal source end does not use subpixelprecision: granularity is 8 pixels */
                /* notes:
                 * - correctly programming horizontal source end minimizes used bandwidth;
                 * - adding 9 below is in fact:
                 *   - adding 1 to round-up to the nearest whole source-end value
                 *     (making SURE we NEVER are a (tiny) bit too low);
                 *   - adding 1 to convert 'last used position' to 'number of used pixels';
                 *   - adding 7 to round-up to the nearest higher (or equal) valid register
                 *     value (needed because of it's 8-pixel granularity). */
                PCIGRPHW(0xbc, ((((moi.hsrcendv >> 16) + 9) >> 3) - 1));
        }
        PCIGRPHW(BUF1ORGL, (moi.a1orgv & 0xff));
        PCIGRPHW(BUF1ORGM, ((moi.a1orgv >> 8) & 0xff));
        PCIGRPHW(BUF1ORGH, ((moi.a1orgv >> 16) & 0xff));

        /* ??? */
        PCIGRPHW(0xbd, 0x02);
        PCIGRPHW(0xbe, 0x00);
        /* b2 = 0: don't use horizontal mirroring (NM2160) */
        /* other bits do ??? */
        PCIGRPHW(0xbf, 0x02);
        /* ??? */
        PCISEQW(0x1c, 0xfb);
        PCISEQW(0x1d, 0x00);
        PCISEQW(0x1e, 0xe2);
        PCISEQW(0x1f, 0x02);
        /* b1 = 0: disable alternating hardware buffers (NM2160) */
        /* other bits do ??? */
        PCISEQW(0x09, 0x11);
        /* we don't use PCMCIA Zoomed Video port capturing, set 1:1 scale just in case */
        /* (b6-4 = Y downscale = 100%, b2-0 = X downscale = 100%;
         *  downscaling selectable in 12.5% steps on increasing setting by 1) */
        PCISEQW(ZVCAP_DSCAL, 0x00);
}

status_t nm_configure_bes
        (const overlay_buffer *ob, const overlay_window *ow, const overlay_view *ov, int offset)
{
        /* yuy2 (4:2:2) colorspace calculations */
        /* Note: Some calculations will have to be modified for other colorspaces if they are incorporated. */

        /* Note:
         * in BeOS R5.0.3 and DANO:
         * 'ow->offset_xxx' is always 0, so not used;
         * 'ow->width' and 'ow->height' are the output window size: does not change
         * if window is clipping;
         * 'ow->h_start' and 'ow->v_start' are the left-top position of the output
         * window. These values can be negative: this means the window is clipping
         * at the left or the top of the display, respectively. */

        /* 'ov' is the view in the source bitmap, so which part of the bitmap is actually
         * displayed on screen. This is used for the 'hardware zoom' function. */
 
        /* bes setup data */
        nm_bes_data bi;
        /* inverse scaling factor, used for source positioning */
        uint32 ifactor;
        /* copy of overlay view which has checked valid values */
        overlay_view my_ov;


        /**************************************************************************************
         *** copy, check and limit if needed the user-specified view into the intput bitmap ***
         **************************************************************************************/
        my_ov = *ov;
        /* check for valid 'coordinates' */
        if (my_ov.width == 0) my_ov.width++;
        if (my_ov.height == 0) my_ov.height++;
        if (my_ov.h_start > ((ob->width - si->overlay.myBufInfo[offset].slopspace) - 1))
                my_ov.h_start = ((ob->width - si->overlay.myBufInfo[offset].slopspace) - 1);
        if (((my_ov.h_start + my_ov.width) - 1) > ((ob->width - si->overlay.myBufInfo[offset].slopspace) - 1))
                my_ov.width = ((((ob->width - si->overlay.myBufInfo[offset].slopspace) - 1) - my_ov.h_start) + 1);
        if (my_ov.v_start > (ob->height - 1))
                my_ov.v_start = (ob->height - 1);
        if (((my_ov.v_start + my_ov.height) - 1) > (ob->height - 1))
                my_ov.height = (((ob->height - 1) - my_ov.v_start) + 1);

        LOG(6,("Overlay: inputbuffer view (zoom) left %d, top %d, width %d, height %d\n",
                my_ov.h_start, my_ov.v_start, my_ov.width, my_ov.height));

        /* save for nm_bes_calc_move_overlay() */
        si->overlay.ow = *ow;
        si->overlay.ob = *ob;
        si->overlay.my_ov = my_ov;


        /********************************
         *** setup horizontal scaling ***
         ********************************/

        LOG(6,("Overlay: total input picture width = %d, height = %d\n",
                        (ob->width - si->overlay.myBufInfo[offset].slopspace), ob->height));
        LOG(6,("Overlay: output picture width = %d, height = %d\n", ow->width, ow->height));

        /* calculate inverse horizontal scaling factor, taking zoom into account */
        ifactor = ((((uint32)my_ov.width) << 12) / ow->width); 

        /* correct factor to prevent most-right visible 'line' from distorting */
        ifactor -= 1;
        bi.hiscalv = ifactor;
        /* save for nv_bes_calc_move_overlay() */
        si->overlay.h_ifactor = ifactor;
        LOG(4,("Overlay: horizontal scaling factor is %f\n", (float)4096 / ifactor));

        /* check scaling factor (and modify if needed) to be within scaling limits */
        /* the upscaling limit is 8.0 (see official Neomagic specsheets) */
        if (bi.hiscalv < 0x00000200)
        {
                /* (non-inverse) factor too large, set factor to max. valid value */
                bi.hiscalv = 0x00000200;
                LOG(4,("Overlay: horizontal scaling factor too large, clamping at %f\n", (float)4096 / bi.hiscalv));
        }
        /* horizontal downscaling cannot be done by NM BES hardware */
        if (bi.hiscalv > (1 << 12))
        {
                /* (non-inverse) factor too small, set factor to min. valid value */
                bi.hiscalv = 0x1000;
                LOG(4,("Overlay: horizontal scaling factor too small, clamping at %f\n", (float)4096 / bi.hiscalv));
        }


        /******************************
         *** setup vertical scaling ***
         ******************************/

        /* calculate inverse vertical scaling factor, taking zoom into account */
        ifactor = ((((uint32)my_ov.height) << 12) / ow->height); 

        /* correct factor to prevent lowest visible line from distorting */
        ifactor -= 1;
        LOG(4,("Overlay: vertical scaling factor is %f\n", (float)4096 / ifactor));

        /* preserve ifactor for source positioning calculations later on */
        bi.viscalv = ifactor;
        /* save for nv_bes_calc_move_overlay() */
        si->overlay.v_ifactor = ifactor;

        /* check scaling factor (and modify if needed) to be within scaling limits */
        /* the upscaling limit is 8.0 (see official Neomagic specsheets) */
        if (bi.viscalv < 0x00000200)
        {
                /* (non-inverse) factor too large, set factor to max. valid value */
                bi.viscalv = 0x00000200;
                LOG(4,("Overlay: vertical scaling factor too large, clamping at %f\n", (float)4096 / bi.viscalv));
        }
        /* vertical downscaling cannot be done by NM BES hardware */
        if (bi.viscalv > (1 << 12))
        {
                /* (non-inverse) factor too small, set factor to min. valid value */
                bi.viscalv = 0x1000;
                LOG(4,("Overlay: vertical scaling factor too small, clamping at %f\n", (float)4096 / bi.viscalv));
        }


        /********************************************************************************
         *** setup all edges of output window, setup horizontal and vertical clipping ***
         ********************************************************************************/
        nm_bes_calc_move_overlay(&(bi.moi));


        /*****************************
         *** log color keying info ***
         *****************************/

        LOG(6,("Overlay: key_red %d, key_green %d, key_blue %d, key_alpha %d\n",
                ow->red.value, ow->green.value, ow->blue.value, ow->alpha.value));
        LOG(6,("Overlay: mask_red %d, mask_green %d, mask_blue %d, mask_alpha %d\n",
                ow->red.mask, ow->green.mask, ow->blue.mask, ow->alpha.mask));


        /*************************
         *** setup BES control ***
         *************************/

        /* BES global control: setup functions */
        bi.globctlv = 0;

        /* enable BES */
        bi.globctlv |= 1 << 0;
        /* enable colorkeying if requested */
        if (ow->flags & B_OVERLAY_COLOR_KEY) bi.globctlv |= 1 << 1;
        /* b3 = 1: distorts right-half of overlay output. Keeping it zero. */
        /* colorspace is YV12, I420 or YUY2 (no RV15 or RV16) */
        bi.globctlv |= 0 << 5;

        /* enable auto-alternating hardware buffers if alternating buffers is enabled (NM2160) */
        bi.globctlv |= 1 << 8;
        /* disable capture */
        bi.globctlv |= 1 << 13;
        /* capture: display one buffer (no alternating buffers) */
        bi.globctlv |= 0 << 14;
        /* capture: display frame (no field) */
        bi.globctlv |= 0 << 15;

        /* BTW: horizontal and vertical filtering are always turned on in NM hardware. */


        /*************************************
         *** sync to BES (Back End Scaler) ***
         *************************************/

        /* Make sure reprogramming the BES completes before the next retrace occurs,
         * to prevent register-update glitches (double buffer feature). */

        //fixme if needed...


        /**************************************
         *** actually program the registers ***
         **************************************/

        if (si->ps.card_type >= NM2097)
        {
                /* helper: some cards use pixels to define buffer pitch, others use bytes */
                uint16 buf_pitch = ob->width;

                /* PCI card */
                LOG(4,("Overlay: accelerant is programming BES\n"));
                /* unlock card overlay sequencer registers (b5 = 1) */
                PCIGRPHW(GENLOCK, (PCIGRPHR(GENLOCK) | 0x20));
                /* destination rectangle #1 (output window position and size) */
                PCIGRPHW(HD1COORD1L, ((bi.moi.hcoordv >> 16) & 0xff));
                PCIGRPHW(HD1COORD2L, (bi.moi.hcoordv & 0xff));
                PCIGRPHW(HD1COORD21H, (((bi.moi.hcoordv >> 4) & 0xf0) | ((bi.moi.hcoordv >> 24) & 0x0f)));
                PCIGRPHW(VD1COORD1L, ((bi.moi.vcoordv >> 16) & 0xff));
                PCIGRPHW(VD1COORD2L, (bi.moi.vcoordv & 0xff));
                PCIGRPHW(VD1COORD21H, (((bi.moi.vcoordv >> 4) & 0xf0) | ((bi.moi.vcoordv >> 24) & 0x0f)));
                /* scaling */
                PCIGRPHW(XSCALEL, (bi.hiscalv & 0xff));
                PCIGRPHW(XSCALEH, ((bi.hiscalv >> 8) & 0xff));
                PCIGRPHW(YSCALEL, (bi.viscalv & 0xff));
                PCIGRPHW(YSCALEH, ((bi.viscalv >> 8) & 0xff));
                /* inputbuffer #1 origin */
                /* (we don't program buffer #2 as it's unused.) */
                if (si->ps.card_type < NM2200)
                {
                        bi.moi.a1orgv >>= 1;
                        /* horizontal source end does not use subpixelprecision: granularity is 8 pixels */
                        /* notes:
                         * - correctly programming horizontal source end minimizes used bandwidth;
                         * - adding 9 below is in fact:
                         *   - adding 1 to round-up to the nearest whole source-end value
                               (making SURE we NEVER are a (tiny) bit too low);
                             - adding 1 to convert 'last used position' to 'number of used pixels';
                             - adding 7 to round-up to the nearest higher (or equal) valid register
                               value (needed because of it's 8-pixel granularity). */
                        PCIGRPHW(0xbc, ((((bi.moi.hsrcendv >> 16) + 9) >> 3) - 1));
                }
                else
                {
                        /* NM2200 and later cards use bytes to define buffer pitch */
                        buf_pitch <<= 1;
                        /* horizontal source end does not use subpixelprecision: granularity is 16 pixels */
                        /* notes:
                         * - programming this register just a tiny bit too low messes up vertical
                         *   scaling badly (also distortion stripes and flickering are reported)!
                         * - not programming this register correctly will mess-up the picture when
                         *   it's partly clipping on the right side of the screen...
                         * - make absolutely sure the engine can fetch the last pixel needed from
                         *   the sourcebitmap even if only to generate a tiny subpixel from it!
                         *   (see remarks for < NM2200 cards regarding programming this register) */
                        PCIGRPHW(0xbc, ((((bi.moi.hsrcendv >> 16) + 17) >> 4) - 1));
                }
                PCIGRPHW(BUF1ORGL, (bi.moi.a1orgv & 0xff));
                PCIGRPHW(BUF1ORGM, ((bi.moi.a1orgv >> 8) & 0xff));
                PCIGRPHW(BUF1ORGH, ((bi.moi.a1orgv >> 16) & 0xff));
                /* ??? */
                PCIGRPHW(0xbd, 0x02);
                PCIGRPHW(0xbe, 0x00);
                /* b2 = 0: don't use horizontal mirroring (NM2160) */
                /* other bits do ??? */
                PCIGRPHW(0xbf, 0x02);
                /* ??? */
            PCISEQW(0x1c, 0xfb);
        PCISEQW(0x1d, 0x00);
                PCISEQW(0x1e, 0xe2);
        PCISEQW(0x1f, 0x02);
                /* b1 = 0: disable alternating hardware buffers (NM2160) */
                /* other bits do ??? */
                PCISEQW(0x09, 0x11);
                /* we don't use PCMCIA Zoomed Video port capturing, set 1:1 scale just in case */
                /* (b6-4 = Y downscale = 100%, b2-0 = X downscale = 100%;
                 *  downscaling selectable in 12.5% steps on increasing setting by 1) */
                PCISEQW(ZVCAP_DSCAL, 0x00);
                /* global BES control */
                PCIGRPHW(BESCTRL1, (bi.globctlv & 0xff));
                PCISEQW(BESCTRL2, ((bi.globctlv >> 8) & 0xff));


                /**************************
                 *** setup color keying ***
                 **************************/

                PCIGRPHW(COLKEY_R, (ow->red.value & ow->red.mask));
                PCIGRPHW(COLKEY_G, (ow->green.value & ow->green.mask));
                PCIGRPHW(COLKEY_B, (ow->blue.value & ow->blue.mask));


                /*************************
                 *** setup misc. stuff ***
                 *************************/

                /* setup brightness to be 'neutral' (two's complement number) */
                PCIGRPHW(BRIGHTNESS, 0x00);

                /* setup inputbuffer #1 pitch including slopspace */
                /* (we don't program the pitch for inputbuffer #2 as it's unused.) */
                PCIGRPHW(BUF1PITCHL, (buf_pitch & 0xff));
                PCIGRPHW(BUF1PITCHH, ((buf_pitch >> 8) & 0xff));
        }
        else
        {
                /* ISA card. Speed required, so:
                 * program entire sequence in kerneldriver in one context switch! */
                LOG(4,("Overlay: kerneldriver programs BES\n"));

                /* complete BES info struct... */
                bi.card_type = si->ps.card_type;
                bi.colkey_r = (ow->red.value & ow->red.mask);
                bi.colkey_g = (ow->green.value & ow->green.mask);
                bi.colkey_b = (ow->blue.value & ow->blue.mask);
                bi.ob_width = ob->width;
                bi.move_only = false;
                /* ... and call kerneldriver to program the BES */
                bi.magic = NM_PRIVATE_DATA_MAGIC;
                ioctl(fd, NM_PGM_BES, &bi, sizeof(bi));
        }

        /* note that overlay is in use (for nm_bes_move_overlay()) */
        si->overlay.active = true;

        return B_OK;
}

/* Switch the scaler off by writing fixed values to both BES control
 * registers, and mark the overlay as no longer in use. Always returns B_OK. */
status_t nm_release_bes()
{
        /* setup BES control: disable scaler */
        if (si->ps.card_type < NM2097)
        {
                /* ISA card: use the ISA register access macros */
                ISAGRPHW(BESCTRL1, 0x02);
                ISASEQW(BESCTRL2, 0xa0);
        }
        else
        {
                /* PCI card: use the PCI register access macros */
                PCIGRPHW(BESCTRL1, 0x02);
                PCISEQW(BESCTRL2, 0xa0);
        }

        /* nm_bes_move_overlay() checks this flag: overlay is not in use anymore */
        si->overlay.active = false;

        return B_OK;
}