root/src/add-ons/accelerants/radeon/overlay.c
/*
        Copyright (c) 2002-2004, Thomas Kurschel
        
        Part of Radeon accelerant
                
        Hardware access routines for overlays
*/

#include "GlobalData.h"
#include "radeon_interface.h"
#include "mmio.h"
#include "overlay_regs.h"
#include "pll_regs.h"
#include "capture_regs.h"
#include "utils.h"
#include "pll_access.h"
#include <math.h>
#include <string.h>
#include "CP.h"


void Radeon_TempHideOverlay( accelerator_info *ai );

// standard (linear) gamma
//
// One entry per overlay gamma register. Radeon_InitOverlay() walks this
// table and writes (slope << 16) | offset to "reg"; entries flagged
// r200_or_above are skipped unless the ASIC is rt_r200 or newer.
static struct {
    uint16 reg;             // overlay gamma register to program
    bool r200_or_above;     // true: only written if si->asic >= rt_r200
    uint32 slope;           // goes into the upper 16 bits of the register value
    uint32 offset;          // OR-ed into the lower bits of the register value
} std_gamma[] = {
    { RADEON_OV0_GAMMA_0_F, false, 0x100, 0x0000 },
    { RADEON_OV0_GAMMA_10_1F, false, 0x100, 0x0020 },
    { RADEON_OV0_GAMMA_20_3F, false, 0x100, 0x0040 },
    { RADEON_OV0_GAMMA_40_7F, false, 0x100, 0x0080 },
    { RADEON_OV0_GAMMA_80_BF, true, 0x100, 0x0100 },
    { RADEON_OV0_GAMMA_C0_FF, true, 0x100, 0x0100 },
    { RADEON_OV0_GAMMA_100_13F, true, 0x100, 0x0200 },
    { RADEON_OV0_GAMMA_140_17F, true, 0x100, 0x0200 },
    { RADEON_OV0_GAMMA_180_1BF, true, 0x100, 0x0300 },
    { RADEON_OV0_GAMMA_1C0_1FF, true, 0x100, 0x0300 },
    { RADEON_OV0_GAMMA_200_23F, true, 0x100, 0x0400 },
    { RADEON_OV0_GAMMA_240_27F, true, 0x100, 0x0400 },
    { RADEON_OV0_GAMMA_280_2BF, true, 0x100, 0x0500 },
    { RADEON_OV0_GAMMA_2C0_2FF, true, 0x100, 0x0500 },
    { RADEON_OV0_GAMMA_300_33F, true, 0x100, 0x0600 },
    { RADEON_OV0_GAMMA_340_37F, true, 0x100, 0x0600 },
    { RADEON_OV0_GAMMA_380_3BF, false, 0x100, 0x0700 },
    { RADEON_OV0_GAMMA_3C0_3FF, false, 0x100, 0x0700 }
};


// bring the overlay unit into a defined state before its first use
//
// Resets the scaler, programs fixed filter coefficients, colour keying,
// the deinterlace pattern and the linear gamma table, selects the overlay
// clock divider according to the pixel clock of CRTC "crtc_idx" and
// finally invalidates the active colour space and position/scaling.
void Radeon_InitOverlay( 
        accelerator_info *ai, int crtc_idx )
{
        shared_info *si = ai->si;
        vuint8 *regs = ai->regs;
        const uint gamma_count = sizeof( std_gamma ) / sizeof( std_gamma[0] );
        uint gamma_idx;
        uint32 clock_div;
        bool is_integrated;
        
        SHOW_FLOW0( 0, "" );
        
        // the "toggle" bit inside this value is edge-sensitive and starts out
        // as zero, so the register really has to be written;
        // capturing requires the "software" video port to be selected
        si->overlay_mgr.auto_flip_reg = RADEON_OV0_VID_PORT_SELECT_SOFTWARE;
        
        OUTREG( regs, RADEON_OV0_SCALE_CNTL, RADEON_SCALER_SOFT_RESET );
        OUTREG( regs, RADEON_OV0_AUTO_FLIP_CNTRL, si->overlay_mgr.auto_flip_reg );
        
        // fixed filter coefficients for luma and chroma in both directions
        OUTREG( regs, RADEON_OV0_FILTER_CNTL,
                RADEON_OV0_HC_COEF_ON_HORZ_Y | RADEON_OV0_HC_COEF_ON_HORZ_UV |
                RADEON_OV0_HC_COEF_ON_VERT_Y | RADEON_OV0_HC_COEF_ON_VERT_UV );
        OUTREG( regs, RADEON_OV0_KEY_CNTL,
                RADEON_GRAPHIC_KEY_FN_EQ | RADEON_VIDEO_KEY_FN_FALSE | RADEON_CMP_MIX_OR );
        OUTREG( regs, RADEON_OV0_TEST, 0 );
//      OUTREG( regs, RADEON_FCP_CNTL, RADEON_FCP_CNTL_GND );   // disable capture clock
//      OUTREG( regs, RADEON_CAP0_TRIG_CNTL, 0 );               // disable capturing
        OUTREG( regs, RADEON_OV0_REG_LOAD_CNTL, 0 );
        
        // deinterlacer: always show the most recent field
        OUTREG( regs, RADEON_OV0_DEINTERLACE_PATTERN, 
                0xaaaaa | (9 << RADEON_OV0_DEINT_PAT_LEN_M1_SHIFT) );
        
        // program linear gamma; some registers are only touched on R200 and above
        for( gamma_idx = 0; gamma_idx < gamma_count; ++gamma_idx ) {
                uint32 gamma_value;
                
                if( std_gamma[gamma_idx].r200_or_above && si->asic < rt_r200 )
                        continue;
                
                gamma_value = 
                        (std_gamma[gamma_idx].slope << 16) | std_gamma[gamma_idx].offset;
                OUTREG( regs, std_gamma[gamma_idx].reg, gamma_value );
        }
        
        // the overlay unit copes with at most 175 MHz; above that the clock
        // is divided so only every second pixel is handled
        clock_div = si->crtc[crtc_idx].mode.timing.pixel_clock < 175000 ? 0 : 1;
        
        Radeon_OUTPLLP( regs, si->asic, RADEON_VCLK_ECP_CNTL, 
                clock_div << RADEON_ECP_DIV_SHIFT, ~RADEON_ECP_DIV_MASK );
        
        // integrated chips need the overlay clock forced on
        is_integrated = 
                (si->asic == rt_rs100) || 
                (si->asic == rt_rs200) || 
                (si->asic == rt_rs300);
        
        if( is_integrated ) {
                uint32 vclk_ecp = Radeon_INPLL( regs, si->asic, RADEON_VCLK_ECP_CNTL );
                
                Radeon_OUTPLL( regs, si->asic, RADEON_VCLK_ECP_CNTL,
                        (vclk_ecp | (1<<18)) );
        }
        
        si->active_overlay.crtc_idx = si->pending_overlay.crtc_idx;
        
        // invalidate active colour space
        si->active_overlay.ob.space = -1;
        
        // invalidate position/scaling
        si->active_overlay.ob.width = -1;
}

// colour space transformation matrix
// (field names follow the pattern Ref<output><input>: the coefficient that
//  feeds input channel Y/Cb/Cr into output channel R/G/B)
typedef struct space_transform
{
    float   RefLuma;    // scaling of luma to use full RGB range
    float   RefRCb;             // b/u -> r
    float   RefRY;              // g/y -> r
    float   RefRCr;             // r/v -> r
    float   RefGCb;             // b/u -> g
    float   RefGY;              // g/y -> g
    float   RefGCr;             // r/v -> g
    float   RefBCb;             // b/u -> b
    float   RefBY;              // g/y -> b
    float   RefBCr;             // r/v -> b
} space_transform;


// Parameters for ITU-R BT.601 and ITU-R BT.709 colour spaces
// (initializers follow the field order of space_transform;
//  Radeon_SetTransform() selects the entry via its "ref" argument:
//  index 0 = BT.601, index 1 = BT.709)
space_transform trans_yuv[2] =
{
    { 1.1678, 0.0, 1, 1.6007, -0.3929, 1, -0.8154, 2.0232, 1, 0.0 }, /* BT.601 */
    { 1.1678, 0.0, 1, 1.7980, -0.2139, 1, -0.5345, 2.1186, 1, 0.0 }  /* BT.709 */
};


// RGB is a pass through:
// in space_transform field order these values give a luma scaling of 1 and
// route the "Cr" input to red (RefRCr), the "Y" input to green (RefGY) and
// the "Cb" input to blue (RefBCb); all other coefficients are zero
space_transform trans_rgb =
        { 1, 0, 0, 1, 0, 1, 0, 1, 0, 0 };


// set overlay colour space transformation matrix
//
// Builds the matrix and offsets that convert the pending overlay colour
// space to RGB, writes them to RADEON_OV0_LIN_TRANS_A..F and records the
// pending colour space as the active one.
//
//      bright          - brightness; scaled by contrast, luma gain and 1023
//                        and added to all three channel offsets
//      cont            - contrast; multiplies luma coefficients and offsets
//      sat             - saturation; multiplies the chroma coefficients
//      hue             - hue rotation angle, fed to sin()/cos()
//      red_intensity,
//      green_intensity,
//      blue_intensity  - extra per-channel offsets, scaled like "bright"
//      ref             - index into trans_yuv (0 = BT.601, 1 = BT.709);
//                        values >= 2 fall back to 0; not used for RGB input
static void Radeon_SetTransform( 
        accelerator_info *ai,
        float       bright,
        float       cont,
        float       sat, 
        float       hue,
        float       red_intensity, 
        float       green_intensity, 
        float       blue_intensity,
        uint        ref)
{
        vuint8 *regs = ai->regs;
        shared_info *si = ai->si;
        float       OvHueSin, OvHueCos;
        float       CAdjOff;
        float       RedAdj, GreenAdj, BlueAdj;
        float       OvROff, OvGOff, OvBOff;
        float       OvRY, OvGY, OvBY;
        float       OvRCb, OvRCr;
        float       OvGCb, OvGCr;
        float       OvBCb, OvBCr;
        float       Loff;
        float       Coff;
        
        uint32      dwOvROff, dwOvGOff, dwOvBOff;
        uint32      dwOvRY, dwOvGY, dwOvBY;
        uint32      dwOvRCb, dwOvRCr;
        uint32      dwOvGCb, dwOvGCr;
        uint32      dwOvBCb, dwOvBCr;
        
        space_transform *trans;
        
        SHOW_FLOW0( 0, "" );

        // get proper conversion formula
        switch( si->pending_overlay.ob.space ) {
        case B_YCbCr422:
        case B_YUV12:
                Loff = 16 * 4;          // internal representation is 10 Bits
                Coff = 128 * 4;
                
                if (ref >= 2) 
                        ref = 0;
                
                trans = &trans_yuv[ref];
                break;
                
        case B_RGB15:
        case B_RGB16:
        case B_RGB32:
        default:
                Loff = 0;
                Coff = 0;
                trans = &trans_rgb;
                break;
        }
        
        OvHueSin = sin(hue);
        OvHueCos = cos(hue);
        
        // get matrix values to convert overlay colour space to RGB
        // applying colour adjustment, saturation and luma scaling
        // (saturation doesn't work with RGB input, perhaps it did with some
        //  maths; this is left to the reader :)
        OvRY = cont * trans->RefLuma * trans->RefRY;
        OvGY = cont * trans->RefLuma * trans->RefGY;
        OvBY = cont * trans->RefLuma * trans->RefBY;
        
        OvRCb = sat * -OvHueSin * trans->RefRCr;
        OvRCr = sat * OvHueCos * trans->RefRCr;
        OvGCb = sat * (OvHueCos * trans->RefGCb - OvHueSin * trans->RefGCr);
        OvGCr = sat * (OvHueSin * trans->RefGCb + OvHueCos * trans->RefGCr);
        OvBCb = sat * OvHueCos * trans->RefBCb;
        OvBCr = sat * OvHueSin * trans->RefBCb;
        
        // adjust black level
        // ("trans" already points at the selected matrix, so it must not be
        //  indexed with "ref" again - that would read beyond trans_yuv for
        //  ref == 1 and beyond trans_rgb for any non-zero ref)
        CAdjOff = cont * trans->RefLuma * bright * 1023.0;
        RedAdj = cont * trans->RefLuma * red_intensity * 1023.0;
        GreenAdj = cont * trans->RefLuma * green_intensity * 1023.0;
        BlueAdj = cont * trans->RefLuma * blue_intensity * 1023.0;
        
        // apply offsets: remove the luma/chroma bias of the input format
        OvROff = RedAdj + CAdjOff - OvRY * Loff - (OvRCb + OvRCr) * Coff;
        OvGOff = GreenAdj + CAdjOff - OvGY * Loff - (OvGCb + OvGCr) * Coff;
        OvBOff = BlueAdj + CAdjOff - OvBY * Loff - (OvBCb + OvBCr) * Coff;
        
        // offsets: scaled by 2, truncated to 13 bits, stored in the low bits
        dwOvROff = ((int32)(OvROff * 2.0)) & 0x1fff;
        dwOvGOff = ((int32)(OvGOff * 2.0)) & 0x1fff;
        dwOvBOff = ((int32)(OvBOff * 2.0)) & 0x1fff;

        // coefficients: scaled by 2048, truncated to 15 bits and placed
        // either at bit 17 or at bit 1 of the register value
        dwOvRY = (((int32)(OvRY * 2048.0))&0x7fff)<<17;
        dwOvGY = (((int32)(OvGY * 2048.0))&0x7fff)<<17;
        dwOvBY = (((int32)(OvBY * 2048.0))&0x7fff)<<17;
        dwOvRCb = (((int32)(OvRCb * 2048.0))&0x7fff)<<1;
        dwOvRCr = (((int32)(OvRCr * 2048.0))&0x7fff)<<17;
        dwOvGCb = (((int32)(OvGCb * 2048.0))&0x7fff)<<1;
        dwOvGCr = (((int32)(OvGCr * 2048.0))&0x7fff)<<17;
        dwOvBCb = (((int32)(OvBCb * 2048.0))&0x7fff)<<1;
        dwOvBCr = (((int32)(OvBCr * 2048.0))&0x7fff)<<17;

        OUTREG( regs, RADEON_OV0_LIN_TRANS_A, dwOvRCb | dwOvRY );
        OUTREG( regs, RADEON_OV0_LIN_TRANS_B, dwOvROff | dwOvRCr );
        OUTREG( regs, RADEON_OV0_LIN_TRANS_C, dwOvGCb | dwOvGY );
        OUTREG( regs, RADEON_OV0_LIN_TRANS_D, dwOvGOff | dwOvGCr );
        OUTREG( regs, RADEON_OV0_LIN_TRANS_E, dwOvBCb | dwOvBY );
        OUTREG( regs, RADEON_OV0_LIN_TRANS_F, dwOvBOff | dwOvBCr );
        
        si->active_overlay.ob.space = si->pending_overlay.ob.space;
}


// convert Be colour key to rgb value
//
// Packs the per-channel key components into one 0x00RRGGBB-style value.
// For the 15/16 bit modes the components are shifted up so that they sit
// in the upper bits of their 8 bit channel; unknown colour spaces give 0.
static uint32 colourKey2RGB32( 
        uint32 space, uint8 red, uint8 green, uint8 blue ) 
{
        uint32 packed = 0;
        
        SHOW_FLOW0( 3, "" );
        
        // Be's definition of colour keys may be convenient for some driver
        // developers, but it is not well defined: the format had to be found
        // out by experiment, and how alpha is defined is still unknown
        // (according to Rudolf, alpha is never used)
        if( space == B_RGB15 ) {
                // 5 bits per channel
                packed = ((uint32)red << (16+3)) 
                        | ((uint32)green << (8+3)) 
                        | ((uint32)blue << 3);
        } else if( space == B_RGB16 ) {
                // 5 bits red/blue, 6 bits green
                packed = ((uint32)red << (16+3)) 
                        | ((uint32)green << (8+2)) 
                        | ((uint32)blue << 3);
        } else if( space == B_RGB32 || space == B_CMAP8 ) {
                // full 8 bits per channel
                packed = ((uint32)red << 16) | ((uint32)green << 8) | (uint32)blue;
        }
        
        SHOW_FLOW( 3, "key=%lx", packed );
        return packed;
}


// set colour key of overlay
//
// The Be API describes the key as value plus mask per channel, whereas the
// Radeon compares against a colour range; this converts the former into a
// low/high pair and programs the graphics key registers.
static void Radeon_SetColourKey( 
        accelerator_info *ai, const overlay_window *ow )
{
        vuint8 *regs = ai->regs;
        virtual_card *vc = ai->vc;
        uint32 key_low, key_high, ignored_bits;
        
        // lower bound of the range is the key colour itself
        key_low = colourKey2RGB32( vc->mode.space, 
                ow->red.value, ow->green.value, ow->blue.value );
        
        // bits not covered by the mask are unimportant (usually the low
        // order ones); OR-ing them onto the colour yields the highest colour
        // still matching (adding would be more precise but could overflow)
        ignored_bits = ~colourKey2RGB32( vc->mode.space,
                ow->red.mask, ow->green.mask, ow->blue.mask );
        key_high = key_low | ignored_bits;
        
        OUTREG( regs, RADEON_OV0_GRAPHICS_KEY_CLR_LOW, key_low );
        OUTREG( regs, RADEON_OV0_GRAPHICS_KEY_CLR_HIGH, key_high );
        OUTREG( regs, RADEON_OV0_KEY_CNTL, 
                RADEON_GRAPHIC_KEY_FN_EQ | RADEON_VIDEO_KEY_FN_FALSE | RADEON_CMP_MIX_OR );
}

// one horizontal scaling/filter setup; getHScaleFactor() compares
// max_scale against the requested h_inc, which Radeon_ShowOverlay()
// computes with 12 fractional bits (1 << 12 corresponds to 1.0)
typedef struct {
        uint max_scale;                                 // maximum src_width/dest_width, 
                                                                        // i.e. source increment per screen pixel
        uint8 group_size;                               // size of one filter group in pixels
        uint8 p1_step_by, p23_step_by;  // > 0: log(source pixel increment)+1, 2-tap filter
                                                                        // = 0: source pixel increment = 1, 4-tap filter
} hscale_factor;


// scaling/filter tables depending on overlay colour space:
// magnifying pixels is no problem, but minifying can lead to overload,
// so we have to skip pixels and/or use 2-tap filters
//
// columns: max_scale, group_size, p1_step_by, p23_step_by
// (getHScaleFactor() takes the first entry whose max_scale is not below
//  the requested increment)

// 16 bit RGB (used for both RGB15 and RGB16)
static hscale_factor scale_RGB16[] = {
        { (2 << 12),            2, 1, 1 },
        { (4 << 12),            2, 2, 2 },
        { (8 << 12),            2, 3, 3 },
        { (16 << 12),           2, 4, 4 },
        { (32 << 12),           2, 5, 5 }
};

// 32 bit RGB
// columns: max_scale (12 fractional bits), group_size, p1_step_by, p23_step_by;
// only the first entry uses the 4-tap filter (step_by == 0)
static hscale_factor scale_RGB32[] = {
        { (2 << 12) / 3,        2, 0, 0 },
        { (4 << 12) / 3,        4, 1, 1 },
        { (8 << 12) / 3,        4, 2, 2 },
        { (4 << 12),            4, 2, 3 },
        { (16 << 12) / 3,       4, 3, 3 },
        { (8 << 12),            4, 3, 4 },
        { (32 << 12) / 3,       4, 4, 4 },
        { (16 << 12),           4, 5, 5 }
};

// packed YUV 4:2:2 (used for the VYUY422 and YVYU422 entries of
// space_params_table)
// columns: max_scale (12 fractional bits), group_size, p1_step_by, p23_step_by
static hscale_factor scale_YUV[] = {
        { (16 << 12) / 16,      2, 0, 0 },
        { (16 << 12) / 12,      2, 0, 1 },      // mode 4, 1, 0 (as used by YUV12) is impossible
        { (16 << 12) / 8,       4, 1, 1 },
        { (16 << 12) / 6,       4, 1, 2 },
        { (16 << 12) / 4,       4, 2, 2 },
        { (16 << 12) / 3,       4, 2, 3 },
        { (16 << 12) / 2,       4, 3, 3 },
        { (16 << 12) / 1,       4, 4, 4 }
};

// planar YUV12 (used for both the three-plane and the two-plane entry of
// space_params_table)
// columns: max_scale (12 fractional bits), group_size, p1_step_by, p23_step_by
// note: the second and third entry share the same max_scale and differ only
// in which plane uses the 2-tap filter; getHScaleFactor() falls through to
// the third one if the second is rejected because 4-tap is not allowed for
// plane 2/3
static hscale_factor scale_YUV12[] = {
        { (16 << 12) / 16,                      2, 0, 0 },
        { (16 << 12) / 12,                      4, 1, 0 },      
        { (16 << 12) / 12,                      2, 0, 1 },      
        { (16 << 12) / 8,                       4, 1, 1 },
        { (16 << 12) / 6,                       4, 1, 2 },
        { (16 << 12) / 4,                       4, 2, 2 },
        { (16 << 12) / 3,                       4, 2, 3 },
        { (16 << 12) / 2,                       4, 3, 3 },
        { (int)((16 << 12) / 1.5),      4, 3, 4 },
        { (int)((16 << 12) / 1.0),      4, 4, 4 },
        { (int)((16 << 12) / 0.75),     4, 4, 5 },
        { (int)((16 << 12) / 0.5),      4, 5, 5 }
};

// smallest of three values, expressed via min()
// NOTE(review): min() is not defined in this file - presumably it comes from
// an included header; if it is a macro, arguments may be evaluated more than
// once, so only pass side-effect free expressions
#define min3( a, b, c ) (min( (a), min( (b), (c) )))

// planar YUV9
// columns: max_scale (12 fractional bits), group_size, p1_step_by, p23_step_by;
// here max_scale is the smallest of three separately computed limits
// NOTE(review): unlike the other tables, the resulting max_scale values are
// not monotonically increasing (e.g. third and seventh entry) - confirm
// whether that is intended, see also the remark on the seventh entry
static hscale_factor scale_YUV9[] = {
        { min3( (16 << 12) / 12,        (3 << 12) * 1,  (2 << 12) * 4 * 1 ),    2, 0, 0 },
        { min3( (16 << 12) / 8,         (3 << 12) * 1,  (2 << 12) * 4 * 1 ),    4, 1, 0 },
        { min3( (16 << 12) / 10,        (3 << 12) * 1,  (2 << 12) * 4 * 1 ),    2, 0, 1 },
        { min3( (16 << 12) / 6,         (3 << 12) * 1,  (2 << 12) * 4 * 1 ),    4, 1, 1 },
        { min3( (16 << 12) / 5,         (3 << 12) * 1,  (2 << 12) * 4 * 2 ),    4, 1, 2 },
        { min3( (16 << 12) / 3,         (3 << 12) * 2,  (2 << 12) * 4 * 2 ),    4, 2, 2 },
        { min3( (int)((16 << 12) / 2.5),        (3 << 12) * 1,  (2 << 12) * 4 * 4 ),    4, 2, 3 },      // probably, it should be (3 << 12) * 2
        { min3( (int)((16 << 12) / 1.5),        (3 << 12) * 4,  (2 << 12) * 4 * 4 ),    4, 3, 3 },
        { min3( (int)((16 << 12) / 0.75),       (3 << 12) * 8,  (2 << 12) * 4 * 8 ),    4, 4, 4 },
        { min3( (int)((16 << 12) / 0.625),      (3 << 12) * 8,  (2 << 12) * 4 * 16 ),   4, 4, 5 },
        { min3( (int)((16 << 12) / 0.375),      (3 << 12) * 16, (2 << 12) * 4 * 16 ),   4, 5, 5 }
};


// parameters of an overlay colour space
typedef struct {
        uint8 bpp_shift;                                // log2( bytes per pixel (main plane) )
        uint8 bpuv_shift;                               // log2( bytes per pixel (uv-plane) ); 
                                                                        // if there is one plane only: bpp=bpuv
        uint8 num_planes;                               // number of planes
        uint8 h_uv_sub_sample_shift;    // log2( horizontal pixels per uv pair )
        uint8 v_uv_sub_sample_shift;    // log2( vertical pixels per uv pair )
        hscale_factor *factors;                 // scaling/filter table
        uint8 num_factors;                              // number of entries in "factors"
} space_params;

// colour space parameters, indexed by the ati_space value of an overlay
// buffer node (see Radeon_ShowOverlay); entries with a NULL scaling table
// make getHScaleFactor() return NULL
// columns: bpp_shift, bpuv_shift, num_planes, h_uv_sub_sample_shift,
//          v_uv_sub_sample_shift, factors, num_factors
static space_params space_params_table[16] = {
        { 0, 0, 0, 0, 0, NULL, 0 },     // reserved
        { 0, 0, 0, 0, 0, NULL, 0 },     // reserved
        { 0, 0, 0, 0, 0, NULL, 0 },     // reserved
        { 1, 1, 1, 0, 0, scale_RGB16, B_COUNT_OF( scale_RGB16 ) },      // RGB15
        { 1, 1, 1, 0, 0, scale_RGB16, B_COUNT_OF( scale_RGB16 ) },      // RGB16
        { 0, 0, 0, 0, 0, NULL, 0 },     // reserved
        { 2, 2, 1, 0, 0, scale_RGB32, B_COUNT_OF( scale_RGB32 ) },      // RGB32
        { 0, 0, 0, 0, 0, NULL, 0 },     // reserved
        { 0, 0, 0, 0, 0, NULL, 0 },     // reserved
        { 0, 0, 3, 2, 2, scale_YUV9, B_COUNT_OF( scale_YUV9 ) },                // YUV9
        { 0, 0, 3, 1, 1, scale_YUV12, B_COUNT_OF( scale_YUV12 ) },      // YUV12, three-plane
        { 1, 1, 1, 1, 0, scale_YUV, B_COUNT_OF( scale_YUV ) },          // VYUY422
        { 1, 1, 1, 1, 0, scale_YUV, B_COUNT_OF( scale_YUV ) },          // YVYU422
        { 0, 1, 2, 1, 1, scale_YUV12, B_COUNT_OF( scale_YUV12 ) },      // YUV12, two-plane
        { 0, 1, 2, 1, 1, NULL, 0 },     // ???
        { 0, 0, 0, 0, 0, NULL, 0 }      // reserved
};

// get appropriate scaling/filter parameters
//
//      params          - parameters of the overlay colour space
//      src_left,
//      src_right       - horizontal source range in whole pixels
//      h_inc           - in: requested source increment per screen pixel;
//                        out: replaced by the max_scale of the last table
//                        entry if no entry can satisfy the request
//
// returns the selected table entry, or NULL if the colour space has no
// scaling table
static hscale_factor *getHScaleFactor( 
        accelerator_info *ai,
        space_params *params, 
        uint32 src_left, uint32 src_right, uint32 *h_inc )
{
        uint p1_words, p23_words, fifo_words;
        bool p1_4tap_allowed, p23_4tap_allowed;
        hscale_factor *table, *chosen;
        uint table_len, idx;

        SHOW_FLOW0( 3, "" );

        // the vertical 4-tap filter is only usable if the fifo can hold
        // the lines, so determine the number of 16 byte words per line first
        p1_words = 
                ceilShiftDiv( (src_right - 1) << params->bpp_shift, 4 ) - 
                ((src_left << params->bpp_shift) >> 4) + 1;
        p23_words = 
                ceilShiftDiv( (src_right - 1) << params->bpuv_shift, 4 ) - 
                ((src_left << params->bpuv_shift) >> 4) + 1;

        // line length of the overlay scaler depends on the chip revision;
        // this list has to be maintained by hand
        if (ai->si->asic != rt_r200 && ai->si->asic < rt_r300)
                fifo_words = 1536 / 16;
        else
                fifo_words = 1920 / 16;

        // the planes have to share the line buffer
        if (params->num_planes == 3) {
                p1_4tap_allowed = p1_words < fifo_words / 2;
                p23_4tap_allowed = p23_words < fifo_words / 4;
        } else if (params->num_planes == 2) {
                p1_4tap_allowed = p1_words < fifo_words / 2;
                p23_4tap_allowed = p23_words < fifo_words / 2;
        } else {
                p1_4tap_allowed = p1_words < fifo_words;
                p23_4tap_allowed = p1_4tap_allowed;
        }

        SHOW_FLOW( 3, "p1_4tap_allowed=%d, p23_4t_allowed=%d", 
                (int)p1_4tap_allowed, (int)p23_4tap_allowed );

        table = params->factors;
        table_len = params->num_factors;

        if (table == NULL || table_len == 0)
                return NULL;

        // take the first entry that allows the requested increment and
        // whose filter choice is permitted for both plane groups
        chosen = NULL;
        for (idx = 0; idx < table_len; ++idx) {
                hscale_factor *entry = &table[idx];
                bool p1_ok = entry->p1_step_by > 0 || p1_4tap_allowed;
                bool p23_ok = entry->p23_step_by > 0 || p23_4tap_allowed;

                if (*h_inc <= entry->max_scale && p1_ok && p23_ok) {
                        chosen = entry;
                        break;
                }
        }

        if (chosen == NULL) {
                // overlay is asked to be scaled down more than allowed,
                // so fall back to the last entry and limit the increment to it
                chosen = &table[table_len - 1];
                *h_inc = chosen->max_scale;
        }

        SHOW_FLOW( 3, "group_size=%d, p1_step_by=%d, p23_step_by=%d", 
                chosen->group_size, chosen->p1_step_by, chosen->p23_step_by );

        return chosen;
}                       


// convert a (possibly fractional) value to unsigned fixed point with
// "shift" fractional bits, e.g. I2FF( 2.5, 12 ) == 0x2800
#define I2FF( a, shift ) ((uint32)((a) * (1 << (shift))))


// show overlay on screen
static status_t Radeon_ShowOverlay( 
        accelerator_info *ai, int crtc_idx )
{
        virtual_card *vc = ai->vc;
        shared_info *si = ai->si;
        vuint8 *regs = ai->regs;
        overlay_info *overlay = &si->pending_overlay;
        overlay_buffer_node *node = overlay->on;
        crtc_info *crtc = &si->crtc[crtc_idx];

        uint32 ecp_div;
        uint32 v_inc, h_inc;
        uint32 src_v_inc, src_h_inc;
        uint32 src_left, src_top, src_right, src_bottom;
        int32 dest_left, dest_top, dest_right, dest_bottom;
        uint32 offset;
        uint32 tmp;
        uint32 p1_h_accum_init, p23_h_accum_init, p1_v_accum_init, p23_v_accum_init;
        uint32 p1_active_lines, p23_active_lines;
        hscale_factor *factors;
        space_params *params;
        
        uint32 p1_h_inc, p23_h_inc;
        uint32 p1_x_start, p1_x_end;
        uint32 p23_x_start, p23_x_end;
        
        uint scale_ctrl;
                
        /*uint32 buffer[20*2];
        uint idx = 0;*/
        
        SHOW_FLOW0( 0, "" );
        
        Radeon_SetColourKey( ai, &overlay->ow );
        
        // overlay unit can only handle up to 175 MHz; if pixel clock is higher,
        // only every second pixel is handled
        // (this divider gets written into the PLL by Radeon_InitOverlay,
        //  so we don't need to do it ourselves)
        if( crtc->mode.timing.pixel_clock < 175000 )
                ecp_div = 0;
        else
                ecp_div = 1;


        // scaling is independent of clipping, get this first
        {
                uint32 src_width, src_height;

                src_width = overlay->ov.width;
                src_height = overlay->ov.height;
        
                // this is for graphics card
                v_inc = (src_height << 20) / overlay->ow.height;
                h_inc = (src_width << (12 + ecp_div)) / overlay->ow.width;
                
        
                // this is for us       
                src_v_inc = (src_height << 16) / overlay->ow.height;
                src_h_inc = (src_width << 16) / overlay->ow.width;
        }
        
        // calculate unclipped position/size
        // TBD: I assume that overlay_window.offset_xyz is only a hint where 
        //      no overlay is visible; another interpretation were to zoom 
        //      the overlay so it fits into remaining space
        src_left = (overlay->ov.h_start << 16) + overlay->ow.offset_left * src_h_inc;
        src_top = (overlay->ov.v_start << 16) + overlay->ow.offset_top * src_v_inc;
        src_right = ((overlay->ov.h_start + overlay->ov.width) << 16) - 
                overlay->ow.offset_right * src_h_inc;
        src_bottom = ((overlay->ov.v_start + overlay->ov.height) << 16) - 
                overlay->ow.offset_top * src_v_inc;
        dest_left = overlay->ow.h_start + overlay->ow.offset_left;
        dest_top = overlay->ow.v_start + overlay->ow.offset_top;
        dest_right = overlay->ow.h_start + overlay->ow.width - overlay->ow.offset_right;
        dest_bottom = overlay->ow.v_start + overlay->ow.height - overlay->ow.offset_bottom;
        
        SHOW_FLOW( 3, "ow: h=%d, v=%d, width=%d, height=%d",
                overlay->ow.h_start, overlay->ow.v_start, 
                overlay->ow.width, overlay->ow.height );
                
        SHOW_FLOW( 3, "offset_left=%d, offset_right=%d, offset_top=%d, offset_bottom=%d", 
                overlay->ow.offset_left, overlay->ow.offset_right, 
                overlay->ow.offset_top, overlay->ow.offset_bottom );

        
        // apply virtual screen
        dest_left -= vc->mode.h_display_start + crtc->rel_x;
        dest_top -= vc->mode.v_display_start + crtc->rel_y;
        dest_right -= vc->mode.h_display_start + crtc->rel_x;
        dest_bottom -= vc->mode.v_display_start + crtc->rel_y;

        // clip to visible area
        if( dest_left < 0 ) {
                src_left += -dest_left * src_h_inc;
                dest_left = 0;
        }
        if( dest_top < 0 ) {
                src_top += -dest_top * src_v_inc;
                dest_top = 0;
        }
        
        SHOW_FLOW( 3, "mode: w=%d, h=%d", 
                crtc->mode.timing.h_display, crtc->mode.timing.v_display );
        
        if( dest_right > crtc->mode.timing.h_display )
                dest_right = crtc->mode.timing.h_display;
        if( dest_bottom > crtc->mode.timing.v_display )
                dest_bottom = crtc->mode.timing.v_display;

        SHOW_FLOW( 3, "src=(%d, %d, %d, %d)", 
                src_left, src_top, src_right, src_bottom );
        SHOW_FLOW( 3, "dest=(%d, %d, %d, %d)", 
                dest_left, dest_top, dest_right, dest_bottom );


        // especially with multi-screen modes the overlay may not be on screen at all
        if( dest_left >= dest_right || dest_top >= dest_bottom ||
                src_left >= src_right || src_top >= src_bottom )
        {
                Radeon_TempHideOverlay( ai );
                goto done;
        }
        

        // let's calculate all those nice register values
        SHOW_FLOW( 3, "ati_space=%d", node->ati_space );
        params = &space_params_table[node->ati_space];

        // choose proper scaler
        {
                factors = getHScaleFactor( ai, params, src_left >> 16, src_right >> 16, &h_inc );
                if( factors == NULL )
                        return B_ERROR;
                        
                p1_h_inc = factors->p1_step_by > 0 ? 
                        h_inc >> (factors->p1_step_by - 1) : h_inc;
                p23_h_inc = 
                        (factors->p23_step_by > 0 ? h_inc >> (factors->p23_step_by - 1) : h_inc) 
                        >> params->h_uv_sub_sample_shift;
                
                SHOW_FLOW( 3, "p1_h_inc=%x, p23_h_inc=%x", p1_h_inc, p23_h_inc );
        }

        // get register value for start/end position of overlay image (pixel-precise only)
        {
                uint32 p1_step_size, p23_step_size;
                uint32 p1_left, p1_right, p1_width;
                uint32 p23_left, p23_right, p23_width;
                
                p1_left = src_left >> 16;
                p1_right = src_right >> 16;
                p1_width = p1_right - p1_left;
                
                p1_step_size = factors->p1_step_by > 0 ? (1 << (factors->p1_step_by - 1)) : 1;
                p1_x_start = p1_left % (16 >> params->bpp_shift);
                p1_x_end = ((p1_x_start + p1_width - 1) / p1_step_size) * p1_step_size;
                
                SHOW_FLOW( 3, "p1_x_start=%d, p1_x_end=%d", p1_x_start, p1_x_end );
        
                p23_left = (src_left >> 16) >> params->h_uv_sub_sample_shift;
                p23_right = (src_right >> 16) >> params->h_uv_sub_sample_shift;
                p23_width = p23_right - p23_left;
        
                p23_step_size = factors->p23_step_by > 0 ? (1 << (factors->p23_step_by - 1)) : 1;
                // if resolution of Y and U/V differs but YUV are stored in one 
                // plane then UV alignment depends on Y data, therefore the hack
                // (you are welcome to replace this with some cleaner code ;)
                p23_x_start = p23_left % 
                        ((16 >> params->bpuv_shift) / 
                         (node->ati_space == 11 || node->ati_space == 12 ? 2 : 1));
                p23_x_end = (int)((p23_x_start + p23_width - 1) / p23_step_size) * p23_step_size;
                
                SHOW_FLOW( 3, "p23_x_start=%d, p23_x_end=%d", p23_x_start, p23_x_end );
                
                // get memory location of first word to be read by scaler
                // (save relative offset for fast update)
                si->active_overlay.rel_offset = (src_top >> 16) * node->buffer.bytes_per_row + 
                        ((p1_left << params->bpp_shift) & ~0xf);
                offset = node->mem_offset + si->active_overlay.rel_offset;
                
                SHOW_FLOW( 3, "rel_offset=%x", si->active_overlay.rel_offset );
        }
        
        // get active lines for scaler
        // (we could add additional blank lines for DVD letter box mode,
        //  but this is not supported by API; additionally, this only makes
        //  sense if want to put subtitles onto the black border, which is
        //  supported neither)
        {
                uint16 int_top, int_bottom;
                
                int_top = src_top >> 16;
                int_bottom = (src_bottom >> 16);
                
                p1_active_lines = int_bottom - int_top - 1;
                p23_active_lines = 
                        ceilShiftDiv( int_bottom - 1, params->v_uv_sub_sample_shift ) - 
                        (int_top >> params->v_uv_sub_sample_shift);
                        
                SHOW_FLOW( 3, "p1_active_lines=%d, p23_active_lines=%d", 
                        p1_active_lines, p23_active_lines );
        }
        
        // if picture is stretched for flat panel, we need to scale all
        // vertical values accordingly
        // TBD: there is no description at all concerning this, so v_accum_init may
        //      need to be initialized based on original value
        {
                if( (crtc->active_displays & (dd_lvds | dd_dvi)) != 0 ) {
                        uint64 v_ratio;
                        
                        // convert 32.32 format to 16.16 format; else we 
                        // cannot multiply two fixed point values without
                        // overflow
                        v_ratio = si->flatpanels[crtc->flatpanel_port].v_ratio >> (FIX_SHIFT - 16);
                        
                        v_inc = (v_inc * v_ratio) >> 16;
                }
                
                SHOW_FLOW( 3, "v_inc=%x", v_inc );
        }
        
        // get initial horizontal scaler values, taking care of precharge
        // don't ask questions about formulas - take them as is
        // (TBD: home-brewed sub-pixel source clipping may be wrong, 
        //       especially for uv-planes)
        {
                uint32 p23_group_size;

            tmp = ((src_left & 0xffff) >> 11) + (
                (
                        I2FF( p1_x_start % factors->group_size, 12 ) + 
                        I2FF( 2.5, 12 ) + 
                        p1_h_inc / 2 +
                        I2FF( 0.5, 12-5 )       // rounding
                ) >> (12 - 5)); // scaled by 1 << 5
                
            SHOW_FLOW( 3, "p1_h_accum_init=%x", tmp );
        
                p1_h_accum_init = 
                        ((tmp << 15) & RADEON_OV0_P1_H_ACCUM_INIT_MASK) |
                        ((tmp << 23) & RADEON_OV0_P1_PRESHIFT_MASK);
                
                
                p23_group_size = 2;
                
                tmp = ((src_left & 0xffff) >> 11) + (
                        (
                                I2FF( p23_x_start % p23_group_size, 12 ) + 
                                I2FF( 2.5, 12 ) +
                                p23_h_inc / 2 +
                                I2FF( 0.5, 12-5 )       // rounding 
                        ) >> (12 - 5)); // scaled by 1 << 5
        
                SHOW_FLOW( 3, "p23_h_accum_init=%x", tmp );
        
                p23_h_accum_init = 
                        ((tmp << 15) & RADEON_OV0_P23_H_ACCUM_INIT_MASK) |
                        ((tmp << 23) & RADEON_OV0_P23_PRESHIFT_MASK);
        }

        // get initial vertical scaler values, taking care of precharge
        {
                uint extra_full_line;

                extra_full_line = factors->p1_step_by == 0 ? 1 : 0;
        
            tmp = ((src_top & 0x0000ffff) >> 11) + (
                (min( 
                        I2FF( 1.5, 20 ) + I2FF( extra_full_line, 20 ) + v_inc / 2, 
                        I2FF( 2.5, 20 ) + 2 * I2FF( extra_full_line, 20 )
                 ) + I2FF( 0.5, 20-5 )) // rounding
                >> (20 - 5)); // scaled by 1 << 5
                
            SHOW_FLOW( 3, "p1_v_accum_init=%x", tmp );
        
                p1_v_accum_init = 
                        ((tmp << 15) & RADEON_OV0_P1_V_ACCUM_INIT_MASK) | 0x00000001;

        
                extra_full_line = factors->p23_step_by == 0 ? 1 : 0;
        
                if( params->v_uv_sub_sample_shift > 0 ) {
                        tmp = ((src_top & 0x0000ffff) >> 11) + (
                                (min( 
                                        I2FF( 1.5, 20 ) + 
                                                I2FF( extra_full_line, 20 ) + 
                                                ((v_inc / 2) >> params->v_uv_sub_sample_shift), 
                                        I2FF( 2.5, 20 ) + 
                                                2 * I2FF( extra_full_line, 20 )
                                 ) + I2FF( 0.5, 20-5 )) // rounding
                                >> (20 - 5)); // scaled by 1 << 5
                } else {
                        tmp = ((src_top & 0x0000ffff) >> 11) + (
                                (
                                        I2FF( 2.5, 20 ) + 
                                        2 * I2FF( extra_full_line, 20 ) +
                                        I2FF( 0.5, 20-5 )       // rounding
                                ) >> (20 - 5)); // scaled by 1 << 5
                }
                
                SHOW_FLOW( 3, "p23_v_accum_init=%x", tmp );
        
                p23_v_accum_init = 
                        ((tmp << 15) & RADEON_OV0_P23_V_ACCUM_INIT_MASK) | 0x00000001;          
        }

        // show me what you've got!
        // we could lock double buffering of overlay unit during update
        // (new values are copied during vertical blank, so if we've updated
        // only some of them, you get a whole frame of mismatched values)
        // but during tests I couldn't get the artifacts to go away, so
        // we use the dangerous way, which has the advantage of not
        // requiring any waiting
        
        // let's try to lock overlay unit
        // we would have to wait now until the lock takes effect, but this is
        // impossible with CCE; perhaps we have to convert this code to 
        // direct register access; did that - let's see what happens...
        OUTREG( regs, RADEON_OV0_REG_LOAD_CNTL, RADEON_REG_LD_CTL_LOCK );
        
        // wait until register access is locked
        while( (INREG( regs, RADEON_OV0_REG_LOAD_CNTL) 
                & RADEON_REG_LD_CTL_LOCK_READBACK) == 0 )
                ;
        
        OUTREG( regs, RADEON_OV0_VID_BUF0_BASE_ADRS, offset );
        OUTREG( regs, RADEON_OV0_VID_BUF_PITCH0_VALUE, node->buffer.bytes_per_row );
        OUTREG( regs, RADEON_OV0_H_INC, p1_h_inc | (p23_h_inc << 16) );
        OUTREG( regs, RADEON_OV0_STEP_BY, factors->p1_step_by | (factors->p23_step_by << 8) );
        OUTREG( regs, RADEON_OV0_V_INC, v_inc );
        
        OUTREG( regs,
                crtc->crtc_idx == 0 ? RADEON_OV0_Y_X_START : RADEON_OV1_Y_X_START, 
                (dest_left) | (dest_top << 16) );
        OUTREG( regs, 
                crtc->crtc_idx == 0 ? RADEON_OV0_Y_X_END : RADEON_OV1_Y_X_END,
                (dest_right - 1) | ((dest_bottom - 1) << 16) );

        OUTREG( regs, RADEON_OV0_P1_BLANK_LINES_AT_TOP, 
                RADEON_P1_BLNK_LN_AT_TOP_M1_MASK | (p1_active_lines << 16) );
        OUTREG( regs, RADEON_OV0_P1_X_START_END, p1_x_end | (p1_x_start << 16) );
        OUTREG( regs, RADEON_OV0_P1_H_ACCUM_INIT, p1_h_accum_init );
        OUTREG( regs, RADEON_OV0_P1_V_ACCUM_INIT, p1_v_accum_init );
        
        OUTREG( regs, RADEON_OV0_P23_BLANK_LINES_AT_TOP, 
                RADEON_P23_BLNK_LN_AT_TOP_M1_MASK | (p23_active_lines << 16) );
        OUTREG( regs, RADEON_OV0_P2_X_START_END, 
                p23_x_end | (p23_x_start << 16) );
        OUTREG( regs, RADEON_OV0_P3_X_START_END, 
                p23_x_end | (p23_x_start << 16) );
        OUTREG( regs, RADEON_OV0_P23_H_ACCUM_INIT, p23_h_accum_init );
        OUTREG( regs, RADEON_OV0_P23_V_ACCUM_INIT, p23_v_accum_init );
        
        OUTREG( regs, RADEON_OV0_TEST, node->test_reg );
        
        scale_ctrl = RADEON_SCALER_ENABLE | 
                RADEON_SCALER_DOUBLE_BUFFER | 
                (node->ati_space << 8) | 
                /* RADEON_SCALER_ADAPTIVE_DEINT | */
                RADEON_SCALER_BURST_PER_PLANE |
                (crtc->crtc_idx == 0 ? 0 : RADEON_SCALER_CRTC_SEL );
                
        switch (node->ati_space << 8) {
                case RADEON_SCALER_SOURCE_15BPP: // RGB15
                case RADEON_SCALER_SOURCE_16BPP:
                case RADEON_SCALER_SOURCE_32BPP:
                        OUTREG( regs, RADEON_OV0_SCALE_CNTL, scale_ctrl | 
                                                        RADEON_SCALER_LIN_TRANS_BYPASS);
                        break;
                case RADEON_SCALER_SOURCE_VYUY422: // VYUY422
                case RADEON_SCALER_SOURCE_YVYU422: // YVYU422
                        OUTREG( regs, RADEON_OV0_SCALE_CNTL, scale_ctrl);
                        break;
                default:
                        SHOW_FLOW(4, "What overlay format is this??? %d", node->ati_space);
                        OUTREG( regs, RADEON_OV0_SCALE_CNTL, scale_ctrl |
                         (( ai->si->asic >= rt_r200) ? R200_SCALER_TEMPORAL_DEINT : 0));
                
        }
        
        si->overlay_mgr.auto_flip_reg ^= RADEON_OV0_SOFT_EOF_TOGGLE;
        
        OUTREG( regs, RADEON_OV0_AUTO_FLIP_CNTRL, 
                si->overlay_mgr.auto_flip_reg );
        
        OUTREG( regs, RADEON_OV0_REG_LOAD_CNTL, 0 );
        
done:
        ai->si->active_overlay.on = ai->si->pending_overlay.on;
        ai->si->active_overlay.ow = ai->si->pending_overlay.ow;
        ai->si->active_overlay.ov = ai->si->pending_overlay.ov;
        ai->si->active_overlay.ob = ai->si->pending_overlay.ob;
        ai->si->active_overlay.h_display_start = vc->mode.h_display_start;
        ai->si->active_overlay.v_display_start = vc->mode.v_display_start;

        return B_OK;
}


// temporarily hide the overlay by writing 0 to the scaler control
// register; the overlay bookkeeping in shared info is left untouched
void Radeon_TempHideOverlay( 
        accelerator_info *ai )
{
        vuint8 *regs = ai->regs;

        SHOW_FLOW0( 3, "" );

        // a zero control word clears all scaler control bits
        OUTREG( regs, RADEON_OV0_SCALE_CNTL, 0 );
}


// hide overlay for good (safe to call even if no overlay is visible):
// blanks the scaler and forgets every reference to overlay buffers
void Radeon_HideOverlay( 
        accelerator_info *ai )
{
        shared_info *shared = ai->si;

        // switch the scaler off first
        Radeon_TempHideOverlay( ai );

        // from now on there is neither an active nor a pending overlay
        shared->active_overlay.on = NULL;
        shared->active_overlay.prev_on = NULL;
        shared->pending_overlay.on = NULL;

        // an active crtc index of -1 never equals a pending index, so
        // Radeon_UpdateOverlay runs Radeon_InitOverlay again for the
        // next overlay that gets shown
        shared->active_overlay.crtc_idx = -1;
}


// show new overlay buffer with same parameters as last one
//
// Only the base address of video buffer 0 is reprogrammed; all scaler
// settings stay as they were set up by the last Radeon_ShowOverlay.
// Afterwards the soft-EOF toggle bit of the auto-flip register is
// flipped and the register is rewritten (the toggle is edge-sensitive,
// so the cached copy in overlay_mgr.auto_flip_reg must be kept in sync).
// The caller must make sure that si->pending_overlay.on is not NULL.
static void Radeon_ReplaceOverlayBuffer( 
        accelerator_info *ai )
{
        shared_info *si = ai->si;
        uint32 offset;

        // address of the new buffer, using the same relative offset
        // (source clipping) as the buffer that is currently shown
        offset = si->pending_overlay.on->mem_offset + si->active_overlay.rel_offset;

        if( si->acc_dma ) {
                // DMA mode: queue both register writes in an indirect buffer
                START_IB();

                WRITE_IB_REG( RADEON_OV0_VID_BUF0_BASE_ADRS, offset );

                si->overlay_mgr.auto_flip_reg ^= RADEON_OV0_SOFT_EOF_TOGGLE;
                WRITE_IB_REG( RADEON_OV0_AUTO_FLIP_CNTRL, si->overlay_mgr.auto_flip_reg );

                SUBMIT_IB();
        } else {
                // direct register access
                // NOTE(review): the 2 presumably reserves FIFO room for the
                // two register writes below - confirm against Radeon_WaitForFifo
                Radeon_WaitForFifo( ai, 2 );

                OUTREG( ai->regs, RADEON_OV0_VID_BUF0_BASE_ADRS, offset );

                si->overlay_mgr.auto_flip_reg ^= RADEON_OV0_SOFT_EOF_TOGGLE;
                OUTREG( ai->regs, RADEON_OV0_AUTO_FLIP_CNTRL, si->overlay_mgr.auto_flip_reg );
        }

        // the pending buffer is now the one being displayed
        si->active_overlay.on = si->pending_overlay.on;
}


// compute how many pixels of the overlay window are visible on the
// given crtc; returns 0 if the window lies completely outside of it
static int getIntersectArea( 
        accelerator_info *ai, overlay_window *ow, crtc_info *crtc )
{
        virtual_card *card = ai->vc;
        int x0, y0, x1, y1;
        int clipped_width, clipped_height;

        // overlay rectangle relative to the origin of what this crtc
        // displays (display start of virtual screen plus crtc offset)
        x0 = ow->h_start - (card->mode.h_display_start + crtc->rel_x);
        y0 = ow->v_start - (card->mode.v_display_start + crtc->rel_y);
        x1 = x0 + ow->width;
        y1 = y0 + ow->height;

        // clip against the upper left corner ...
        x0 = x0 < 0 ? 0 : x0;
        y0 = y0 < 0 ? 0 : y0;

        // ... and against the displayed size of the crtc
        if( x1 > crtc->mode.timing.h_display )
                x1 = crtc->mode.timing.h_display;
        if( y1 > crtc->mode.timing.v_display )
                y1 = crtc->mode.timing.v_display;

        clipped_width = x1 - x0;
        clipped_height = y1 - y0;

        // negative extent means there is no intersection at all
        if( clipped_width < 0 || clipped_height < 0 )
                return 0;

        return clipped_width * clipped_height;
}


// update overlay, to be called whenever anything concerning the
// overlay has changed or may have changed
//
// Chooses the crtc the overlay is shown on and then reprograms only
// those parts of the overlay unit whose settings differ between
// si->active_overlay and si->pending_overlay.
//
// returns B_OK if there is nothing to do (the overlay isn't used by
// this virtual card, or no overlay is pending) or after the update has
// been issued; B_BAD_VALUE if the pending overlay token doesn't match
// the overlay manager's current token.
// The result of Radeon_ShowOverlay is not checked.
status_t Radeon_UpdateOverlay( 
        accelerator_info *ai )
{
        virtual_card *vc = ai->vc;
        shared_info *si = ai->si;
        int crtc_idx;

        // neutral colour transform passed to Radeon_SetTransform
        float brightness = 0.0f;
        float contrast = 1.0f;
        float saturation = 1.0f;
        float hue = 0.0f;
        int32 ref = 0;

        SHOW_FLOW0( 3, "" );

        // don't mess around with overlay of someone else
        if ( !vc->uses_overlay )
                return B_OK;

        // make sure there really is an overlay
        if ( si->pending_overlay.on == NULL )
                return B_OK;

        // verify that the overlay is still valid
        if ((uintptr_t)si->pending_overlay.ot != si->overlay_mgr.token )
                return B_BAD_VALUE;

        if ( vc->different_heads > 1 ) {
                int area0, area1;

                // determine on which port most of the overlay is shown;
                // each area must be measured against its own crtc, else
                // both areas are always equal and crtc 0 always wins
                area0 = getIntersectArea( ai, &si->pending_overlay.ow, &si->crtc[0] );
                area1 = getIntersectArea( ai, &si->pending_overlay.ow, &si->crtc[1] );

                SHOW_FLOW( 3, "area0=%d, area1=%d", area0, area1 );

                // on a tie, prefer the first crtc
                if (area0 >= area1 )
                        crtc_idx = 0;
                else
                        crtc_idx = 1;

        } else if ( vc->independant_heads > 1 ) {
                // both ports show the same, use "swap displays" to decide
                // where to show the overlay (to be improved as this flag isn't
                // really designed for that)
                if ( vc->swap_displays )
                        crtc_idx = 1;
                else
                        crtc_idx = 0;
                        
        } else {
        
                // one crtc used only - pick the one that we use
                crtc_idx = vc->used_crtc[0] ? 0 : 1;
        }
        
        si->pending_overlay.crtc_idx = crtc_idx;

        // only update registers that have been changed to minimize work
        if( si->active_overlay.crtc_idx != si->pending_overlay.crtc_idx ) {
                Radeon_InitOverlay( ai, crtc_idx );
        } 
        
        // colour space changed - set up the colour transform again
        if( si->active_overlay.ob.space != si->pending_overlay.ob.space ) {
                Radeon_SetTransform( ai, brightness, contrast, saturation, hue, 0, 0, 0, ref );
        }

        // window, view, display start or buffer layout changed - a full
        // update is needed; if only the buffer itself changed, swapping the
        // buffer address is sufficient
        if( memcmp( &si->active_overlay.ow, &si->pending_overlay.ow, sizeof( si->active_overlay.ow )) != 0 || 
                memcmp( &si->active_overlay.ov, &si->pending_overlay.ov, sizeof( si->active_overlay.ov )) != 0 ||
                si->active_overlay.h_display_start != vc->mode.h_display_start ||
                si->active_overlay.v_display_start != vc->mode.v_display_start ||
                si->active_overlay.ob.width != si->pending_overlay.ob.width ||
                si->active_overlay.ob.height != si->pending_overlay.ob.height ||
                si->active_overlay.ob.bytes_per_row != si->pending_overlay.ob.bytes_per_row )
                Radeon_ShowOverlay( ai, crtc_idx );
                
        else if( si->active_overlay.on != si->pending_overlay.on )
                Radeon_ReplaceOverlayBuffer( ai );
                
        SHOW_FLOW0( 3, "success" );
        
        return B_OK;
}