root/src/add-ons/accelerants/intel_extreme/overlay.cpp
/*
 * Copyright 2006-2009, Haiku, Inc. All Rights Reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *              Axel Dörfler, axeld@pinc-software.de
 *
 * The phase coefficient computation was taken from the X driver written by
 * Alan Hourihane and David Dawes.
 */


#include "accelerant.h"
#include "accelerant_protos.h"
#include "commands.h"

#include <Debug.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#include <AGP.h>


// Debug output helpers. TRACE() only produces output when TRACE_OVERLAY is
// defined (it expands to nothing otherwise); ERROR() always prints.
// Both prefix the message with "intel_extreme: ".
#undef TRACE
//#define TRACE_OVERLAY
#ifdef TRACE_OVERLAY
#       define TRACE(x...) _sPrintf("intel_extreme: " x)
#else
#       define TRACE(x...)
#endif

#define ERROR(x...) _sPrintf("intel_extreme: " x)
// Traces the name of the enclosing function; its arguments are ignored.
#define CALLED(x...) TRACE("CALLED %s\n", __PRETTY_FUNCTION__)


// Dimensions of the scaling filters whose phase coefficients are computed
// by update_coefficients().
#define NUM_HORIZONTAL_TAPS             5
#define NUM_VERTICAL_TAPS               3
#define NUM_HORIZONTAL_UV_TAPS  3
#define NUM_VERTICAL_UV_TAPS    3
// number of phases computed per filter
#define NUM_PHASES                              17
// upper bound on the tap count; sizes the scratch arrays in
// update_coefficients()
#define MAX_TAPS                                5

// A single filter coefficient split into the sign/exponent/mantissa parts
// that are later packed into a coefficient register value
// (sign << 15 | exponent << 12 | mantissa).
struct phase_coefficient {
        // 1 if the coefficient is negative, 0 otherwise
        uint8   sign;
        // 0 - 3, as chosen by split_coefficient(); 3 is used for the smallest
        // magnitudes, 0 for the largest
        uint8   exponent;
        // rounded magnitude, shifted left so that it is aligned to 12 bits
        uint16  mantissa;
};


/*!     Splits the coefficient floating point value into the 3 components
        sign, mantissa, and exponent.
*/
static bool
split_coefficient(double &coefficient, int32 mantissaSize,
        phase_coefficient &splitCoefficient)
{
        double absCoefficient = fabs(coefficient);

        int sign;
        if (coefficient < 0.0)
                sign = 1;
        else
                sign = 0;

        int32 intCoefficient, res;
        int32 maxValue = 1 << mantissaSize;
        res = 12 - mantissaSize;

        if ((intCoefficient = (int)(absCoefficient * 4 * maxValue + 0.5))
                        < maxValue) {
                splitCoefficient.exponent = 3;
                splitCoefficient.mantissa = intCoefficient << res;
                coefficient = (double)intCoefficient / (double)(4 * maxValue);
        } else if ((intCoefficient = (int)(absCoefficient * 2 * maxValue + 0.5))
                        < maxValue) {
                splitCoefficient.exponent = 2;
                splitCoefficient.mantissa = intCoefficient << res;
                coefficient = (double)intCoefficient / (double)(2 * maxValue);
        } else if ((intCoefficient = (int)(absCoefficient * maxValue + 0.5))
                        < maxValue) {
                splitCoefficient.exponent = 1;
                splitCoefficient.mantissa = intCoefficient << res;
                coefficient = (double)intCoefficient / (double)maxValue;
        } else if ((intCoefficient = (int)(absCoefficient * maxValue * 0.5 + 0.5))
                        < maxValue) {
                splitCoefficient.exponent = 0;
                splitCoefficient.mantissa = intCoefficient << res;
                coefficient = (double)intCoefficient / (double)(maxValue / 2);
        } else {
                // coefficient out of range
                return false;
        }

        splitCoefficient.sign = sign;
        if (sign)
                coefficient = -coefficient;

        return true;
}


/*!     Computes the phase coefficients of a scaling filter and stores them in
        split sign/exponent/mantissa form.

        \param taps Number of filter taps (at most MAX_TAPS). The tap adjustment
                order below only fills \a taps entries for odd tap counts.
        \param filterCutOff Filter cut-off; clamped to the range 1 to 3.
        \param horizontal Selects the base mantissa size (7 bits if \c true,
                6 bits otherwise).
        \param isY Together with \a horizontal decides whether the center tap
                gets 2 extra mantissa bits (it does not for a vertical non-Y
                filter).
        \param splitCoefficients Output array with room for NUM_PHASES * \a taps
                entries, indexed by phase * taps + tap.
*/
static void
update_coefficients(int32 taps, double filterCutOff, bool horizontal, bool isY,
        phase_coefficient* splitCoefficients)
{
        if (filterCutOff < 1)
                filterCutOff = 1;
        if (filterCutOff > 3)
                filterCutOff = 3;

        bool isVerticalUV = !horizontal && !isY;
        int32 mantissaSize = horizontal ? 7 : 6;

        double rawCoefficients[MAX_TAPS * 32], coefficients[NUM_PHASES][MAX_TAPS];

        // Sample the windowed sinc function at 32 positions per tap
        int32 num = taps * 16;
        for (int32 i = 0; i < num * 2; i++) {
                double sinc;
                double value = (1.0 / filterCutOff) * taps * M_PI * (i - num)
                        / (2 * num);
                if (value == 0.0)
                        sinc = 1.0;
                else
                        sinc = sin(value) / value;

                // Window function (originally labelled "Hamming window"; the
                // formula used is the raised-cosine 0.5 - 0.5 * cos() form)
                double window = (0.5 - 0.5 * cos(i * M_PI / num));
                rawCoefficients[i] = sinc * window;
        }

        for (int32 i = 0; i < NUM_PHASES; i++) {
                // Normalise the coefficients
                // (the taps of phase i are 32 raw samples apart)
                double sum = 0.0;
                int32 pos;
                for (int32 j = 0; j < taps; j++) {
                        pos = i + j * 32;
                        sum += rawCoefficients[pos];
                }
                for (int32 j = 0; j < taps; j++) {
                        pos = i + j * 32;
                        coefficients[i][j] = rawCoefficients[pos] / sum;
                }

                // split them into sign/mantissa/exponent
                // (split_coefficient() also replaces coefficients[i][j] with the
                // rounded value; its return value is not checked here)
                for (int32 j = 0; j < taps; j++) {
                        pos = j + i * taps;

                        split_coefficient(coefficients[i][j], mantissaSize
                                + (((j == (taps - 1) / 2) && !isVerticalUV) ? 2 : 0),
                                splitCoefficients[pos]);
                }

                // Order in which taps are corrected: the center tap first, then
                // alternately the taps below and above it, moving outwards
                int32 tapAdjust[MAX_TAPS];
                tapAdjust[0] = (taps - 1) / 2;
                for (int32 j = 1, k = 1; j <= tapAdjust[0]; j++, k++) {
                        tapAdjust[k] = tapAdjust[0] - j;
                        tapAdjust[++k] = tapAdjust[0] + j;
                }

                // Adjust the coefficients
                // (rounding may have changed the sum of the taps; add the
                // difference to one tap at a time until the sum is exactly 1.0
                // again, or all taps have been tried)
                sum = 0.0;
                for (int32 j = 0; j < taps; j++) {
                        sum += coefficients[i][j];
                }

                if (sum != 1.0) {
                        for (int32 k = 0; k < taps; k++) {
                                int32 tap2Fix = tapAdjust[k];
                                double diff = 1.0 - sum;

                                coefficients[i][tap2Fix] += diff;
                                pos = tap2Fix + i * taps;

                                split_coefficient(coefficients[i][tap2Fix], mantissaSize
                                        + (((tap2Fix == (taps - 1) / 2) && !isVerticalUV) ? 2 : 0),
                                        splitCoefficients[pos]);

                                sum = 0.0;
                                for (int32 j = 0; j < taps; j++) {
                                        sum += coefficients[i][j];
                                }
                                if (sum == 1.0)
                                        break;
                        }
                }
        }
}


/*!     Writes the color key value and the inverted per-channel masks into the
        overlay registers, and enables color keying.
*/
static void
set_color_key(uint8 red, uint8 green, uint8 blue, uint8 redMask,
        uint8 greenMask, uint8 blueMask)
{
        overlay_registers& regs = *gInfo->overlay_registers;

        // key color
        regs.color_key_red = red;
        regs.color_key_green = green;
        regs.color_key_blue = blue;

        // the registers take the complement of the masks passed in
        regs.color_key_mask_red = ~redMask;
        regs.color_key_mask_green = ~greenMask;
        regs.color_key_mask_blue = ~blueMask;

        regs.color_key_enabled = true;
}


static void
set_color_key(const overlay_window* window)
{
        switch (gInfo->shared_info->current_mode.space) {
                case B_CMAP8:
                        set_color_key(0, 0, window->blue.value, 0x0, 0x0, 0xff);
                        break;
                case B_RGB15:
                        set_color_key(window->red.value << 3, window->green.value << 3,
                                window->blue.value << 3, window->red.mask << 3,
                                window->green.mask << 3, window->blue.mask << 3);
                        break;
                case B_RGB16:
                        set_color_key(window->red.value << 3, window->green.value << 2,
                                window->blue.value << 3, window->red.mask << 3,
                                window->green.mask << 2, window->blue.mask << 3);
                        break;

                default:
                        set_color_key(window->red.value, window->green.value,
                                window->blue.value, window->red.mask, window->green.mask,
                                window->blue.mask);
                        break;
        }
}


static void
update_overlay(bool updateCoefficients)
{
        if (!gInfo->shared_info->overlay_active
                || gInfo->shared_info->device_type.IsModel(INTEL_MODEL_965))
                return;

        QueueCommands queue(gInfo->shared_info->primary_ring_buffer);
        queue.PutFlush();
        queue.PutWaitFor(COMMAND_WAIT_FOR_OVERLAY_FLIP);
        queue.PutOverlayFlip(COMMAND_OVERLAY_CONTINUE, updateCoefficients);

        // make sure the flip is done now
        queue.PutWaitFor(COMMAND_WAIT_FOR_OVERLAY_FLIP);
        queue.PutFlush();

        TRACE("%s: UP: %lx, TST: %lx, ST: %lx, CMD: %lx (%lx), ERR: %lx\n",
                __func__, read32(INTEL_OVERLAY_UPDATE),
                read32(INTEL_OVERLAY_TEST), read32(INTEL_OVERLAY_STATUS),
                *(((uint32*)gInfo->overlay_registers) + 0x68/4), read32(0x30168),
                read32(0x2024));
}


static void
show_overlay(void)
{
        if (gInfo->shared_info->overlay_active
                || gInfo->shared_info->device_type.IsModel(INTEL_MODEL_965))
                return;

        gInfo->shared_info->overlay_active = true;
        gInfo->overlay_registers->overlay_enabled = true;

        QueueCommands queue(gInfo->shared_info->primary_ring_buffer);
        queue.PutOverlayFlip(COMMAND_OVERLAY_ON, true);
        queue.PutFlush();

        TRACE("%s: UP: %lx, TST: %lx, ST: %lx, CMD: %lx (%lx), ERR: %lx\n",
                __func__, read32(INTEL_OVERLAY_UPDATE),
                read32(INTEL_OVERLAY_TEST), read32(INTEL_OVERLAY_STATUS),
                *(((uint32*)gInfo->overlay_registers) + 0x68/4),
                read32(0x30168), read32(0x2024));
}


static void
hide_overlay(void)
{
        if (!gInfo->shared_info->overlay_active
                || gInfo->shared_info->device_type.IsModel(INTEL_MODEL_965))
                return;

        overlay_registers* registers = gInfo->overlay_registers;

        gInfo->shared_info->overlay_active = false;
        registers->overlay_enabled = false;

        QueueCommands queue(gInfo->shared_info->primary_ring_buffer);

        // flush pending commands
        queue.PutFlush();
        queue.PutWaitFor(COMMAND_WAIT_FOR_OVERLAY_FLIP);

        // clear overlay enabled bit
        queue.PutOverlayFlip(COMMAND_OVERLAY_CONTINUE, false);
        queue.PutWaitFor(COMMAND_WAIT_FOR_OVERLAY_FLIP);

        // turn off overlay engine
        queue.PutOverlayFlip(COMMAND_OVERLAY_OFF, false);
        queue.PutWaitFor(COMMAND_WAIT_FOR_OVERLAY_FLIP);

        gInfo->current_overlay = NULL;
}


//      #pragma mark -


/*!     Returns the number of overlays available for the given display mode.
        The \a mode is currently ignored; a single overlay is always reported.
*/
uint32
intel_overlay_count(const display_mode* mode)
{
        // TODO: make this depending on the amount of RAM and the screen mode
        // (and we could even have more than one when using 3D as well)
        return 1;
}


/*!     Returns a zero-terminated list of the color spaces overlay buffers can
        have. Devices in INTEL_GROUP_96x only get B_YCbCr422; the \a mode is
        not taken into account.
*/
const uint32*
intel_overlay_supported_spaces(const display_mode* mode)
{
        static const uint32 kDefaultSpaces[] = {B_RGB15, B_RGB16, B_RGB32,
                B_YCbCr422, 0};
        static const uint32 kI965Spaces[] = {B_YCbCr422, 0};

        const bool is96x
                = gInfo->shared_info->device_type.InGroup(INTEL_GROUP_96x);

        return is96x ? kI965Spaces : kDefaultSpaces;
}


/*!     Returns the overlay feature flags that are reported for every color
        space; \a colorSpace is not evaluated.
*/
uint32
intel_overlay_supported_features(uint32 colorSpace)
{
        const uint32 features = B_OVERLAY_COLOR_KEY
                | B_OVERLAY_HORIZONTAL_FILTERING
                | B_OVERLAY_VERTICAL_FILTERING
                | B_OVERLAY_HORIZONTAL_MIRRORING;

        return features;
}


/*!     Allocates an overlay buffer of the given dimensions in graphics memory.

        The row size is rounded up to a multiple of 64 bytes (256 bytes on
        INTEL_MODEL_965 devices, which additionally get a separate overlay
        state allocation).

        \return The buffer embedded in a newly allocated struct overlay, or
                \c NULL if the color space is not supported or an allocation
                failed.
*/
const overlay_buffer* 
intel_allocate_overlay_buffer(color_space colorSpace, uint16 width,
        uint16 height)
{
        TRACE("%s(width %u, height %u, colorSpace %lu)\n", __func__, width,
                height, colorSpace);

        uint32 pixelSize;
        switch (colorSpace) {
                case B_RGB15:
                case B_RGB16:
                case B_YCbCr422:
                        pixelSize = 2;
                        break;
                case B_RGB32:
                        pixelSize = 4;
                        break;
                default:
                        return NULL;
        }

        struct overlay* newOverlay
                = (struct overlay*)malloc(sizeof(struct overlay));
        if (newOverlay == NULL)
                return NULL;

        // TODO: locking!

        intel_shared_info& shared = *gInfo->shared_info;
        const bool is965 = shared.device_type.IsModel(INTEL_MODEL_965);

        // mask used to round the row size up
        const int32 rowMask = is965 ? 0xff : 0x3f;

        overlay_buffer* result = &newOverlay->buffer;
        result->space = colorSpace;
        result->width = width;
        result->height = height;
        result->bytes_per_row = (width * pixelSize + rowMask) & ~rowMask;

        // alloc graphics mem
        status_t status = intel_allocate_memory(result->bytes_per_row * height,
                0, newOverlay->buffer_base);
        if (status < B_OK) {
                free(newOverlay);
                return NULL;
        }

        if (is965) {
                status = intel_allocate_memory(INTEL_i965_OVERLAY_STATE_SIZE,
                        B_APERTURE_NON_RESERVED, newOverlay->state_base);
                if (status < B_OK) {
                        // undo the buffer allocation as well
                        intel_free_memory(newOverlay->buffer_base);
                        free(newOverlay);
                        return NULL;
                }

                newOverlay->state_offset = newOverlay->state_base
                        - (addr_t)shared.graphics_memory;
        }

        // offsets are relative to the start of the graphics memory
        newOverlay->buffer_offset = newOverlay->buffer_base
                - (addr_t)shared.graphics_memory;

        result->buffer = (uint8*)newOverlay->buffer_base;
        result->buffer_dma = (uint8*)shared.physical_graphics_memory
                + newOverlay->buffer_offset;

        TRACE("%s: base=%x, offset=%x, address=%x, physical address=%x\n",
                __func__, newOverlay->buffer_base, newOverlay->buffer_offset,
                result->buffer, result->buffer_dma);

        return result;
}


/*!     Frees an overlay buffer that was returned by
        intel_allocate_overlay_buffer(), hiding the overlay first if the buffer
        is the one currently configured.

        \param buffer The buffer to release.
        \return \c B_BAD_VALUE if \a buffer is \c NULL, \c B_OK otherwise.
*/
status_t
intel_release_overlay_buffer(const overlay_buffer* buffer)
{
        CALLED();

        if (buffer == NULL)
                return B_BAD_VALUE;

        // the buffer handed out is the one embedded in the struct overlay
        struct overlay* overlay = (struct overlay*)buffer;

        // TODO: locking!

        if (gInfo->current_overlay == overlay) {
                hide_overlay();

                // hide_overlay() returns early without resetting current_overlay
                // when the overlay is not active, or on INTEL_MODEL_965 devices;
                // make sure we never keep a pointer to the memory freed below.
                gInfo->current_overlay = NULL;
        }

        intel_free_memory(overlay->buffer_base);
        if (gInfo->shared_info->device_type.IsModel(INTEL_MODEL_965))
                intel_free_memory(overlay->state_base);
        free(overlay);

        return B_OK;
}


/*!     Fills in the scaler input (view) and output (window) constraints for
        the given overlay \a buffer and display \a mode.

        \return \c B_BAD_VALUE if the color space of the buffer is not handled
                (the view position alignment has already been written by then),
                \c B_OK otherwise.
*/
status_t
intel_get_overlay_constraints(const display_mode* mode,
        const overlay_buffer* buffer, overlay_constraints* constraints)
{
        CALLED();

        // taken from the Radeon driver...

        // scaler input restrictions
        // TODO: check all these values; most of them are probably too restrictive

        // position
        constraints->view.h_alignment = 0;
        constraints->view.v_alignment = 0;

        // alignment
        switch (buffer->space) {
                case B_RGB15:
                case B_RGB16:
                case B_YCbCr422:
                case B_YUV12:
                        constraints->view.width_alignment = 7;
                        break;
                case B_RGB32:
                        constraints->view.width_alignment = 3;
                        break;
                default:
                        return B_BAD_VALUE;
        }
        constraints->view.height_alignment = 0;

        // size; the minimum of 4 is there to make the 4-tap filter happy
        constraints->view.width.min = 4;
        constraints->view.width.max = buffer->width;
        constraints->view.height.min = 4;
        constraints->view.height.max = buffer->height;

        // scaler output restrictions
        constraints->window.h_alignment = 0;
        constraints->window.v_alignment = 0;
        constraints->window.width_alignment = 0;
        constraints->window.height_alignment = 0;

        constraints->window.width.min = 2;
        constraints->window.height.min = 2;
        constraints->window.width.max = mode->virtual_width;
        constraints->window.height.max = mode->virtual_height;

        // TODO: the minimum values are not tested
        constraints->h_scale.min = 1.0f / (1 << 4);
        constraints->v_scale.min = 1.0f / (1 << 4);
        constraints->h_scale.max = buffer->width * 7;
        constraints->v_scale.max = buffer->height * 7;

        return B_OK;
}


/*!     Claims the overlay channel, and returns a new token for it.
        \return \c NULL if the channel is already in use.
*/
overlay_token
intel_allocate_overlay(void)
{
        CALLED();

        intel_shared_info& sharedInfo = *gInfo->shared_info;

        // we only have a single overlay channel
        const bool alreadyUsed
                = atomic_or(&sharedInfo.overlay_channel_used, 1) != 0;
        if (alreadyUsed)
                return NULL;

        // every successful allocation hands out the next token value
        return (overlay_token)++sharedInfo.overlay_token;
}


/*!     Gives the overlay channel free again.
        \return \c B_BAD_VALUE if \a overlayToken is not the current token,
                \c B_OK otherwise.
*/
status_t
intel_release_overlay(overlay_token overlayToken)
{
        CALLED();

        intel_shared_info& sharedInfo = *gInfo->shared_info;

        // we only have a single token, which simplifies this
        const overlay_token currentToken
                = (overlay_token)sharedInfo.overlay_token;
        if (currentToken != overlayToken)
                return B_BAD_VALUE;

        atomic_and(&sharedInfo.overlay_channel_used, 0);

        return B_OK;
}


/*!     Programs the overlay registers for the given \a buffer, \a window, and
        \a view, and then either shows the overlay (if it was not active yet)
        or updates it.

        Window position, scaling factors, and horizontal phase coefficients are
        only recomputed when the overlay is not active, or when the view or
        window frame differ from the ones used last.

        \return \c B_BAD_VALUE if \a overlayToken is not the current token,
                \c B_OK otherwise. A \c NULL \a window or \a view, as well as a
                window that lies completely outside the current display mode,
                hides the overlay.
*/
status_t
intel_configure_overlay(overlay_token overlayToken,
        const overlay_buffer* buffer, const overlay_window* window,
        const overlay_view* view)
{
        CALLED();

        if (overlayToken != (overlay_token)gInfo->shared_info->overlay_token)
                return B_BAD_VALUE;

        if (window == NULL || view == NULL) {
                hide_overlay();
                return B_OK;
        }

        // NOTE(review): this cast presumes the overlay_buffer is the first
        // member of struct overlay (intel_allocate_overlay_buffer() returns
        // &overlay->buffer) - confirm against the struct definition
        struct overlay* overlay = (struct overlay*)buffer;
        overlay_registers* registers = gInfo->overlay_registers;
        intel_shared_info &sharedInfo = *gInfo->shared_info;
        bool updateCoefficients = false;
        uint32 bytesPerPixel = 2;

        // select the source format; all formats but B_RGB32 use 2 bytes per
        // pixel. Other color spaces leave the source format register untouched.
        switch (buffer->space) {
                case B_RGB15:
                        registers->source_format = OVERLAY_FORMAT_RGB15;
                        break;
                case B_RGB16:
                        registers->source_format = OVERLAY_FORMAT_RGB16;
                        break;
                case B_RGB32:
                        registers->source_format = OVERLAY_FORMAT_RGB32;
                        bytesPerPixel = 4;
                        break;
                case B_YCbCr422:
                        registers->source_format = OVERLAY_FORMAT_YCbCr422;
                        break;
        }

        if (!gInfo->shared_info->overlay_active
                || memcmp(&gInfo->last_overlay_view, view, sizeof(overlay_view)) != 0
                || memcmp(&gInfo->last_overlay_frame, window, sizeof(overlay_frame)) != 0) {
                // scaling has changed, program window and scaling factor

                // clip the window to on screen bounds
                // TODO: this is not yet complete or correct - especially if we start
                // to support moving the display!
                int32 left, top, right, bottom;
                left = window->h_start;
                right = window->h_start + window->width;
                top = window->v_start;
                bottom = window->v_start + window->height;
                if (left < 0)
                        left = 0;
                if (top < 0)
                        top = 0;
                if (right > sharedInfo.current_mode.timing.h_display)
                        right = sharedInfo.current_mode.timing.h_display;
                if (bottom > sharedInfo.current_mode.timing.v_display)
                        bottom = sharedInfo.current_mode.timing.v_display;
                if (left >= right || top >= bottom) {
                        // overlay is not within visible bounds
                        // (this also catches windows with a zero width or height,
                        // which are used as divisors below)
                        hide_overlay();
                        return B_OK;
                }

                registers->window_left = left;
                registers->window_top = top;
                registers->window_width = right - left;
                registers->window_height = bottom - top;

                // The scale factors are fixed point values with 12 fractional
                // bits. The UV factor is half the RGB/Y factor; the latter is
                // rebuilt from it, so that it is always exactly twice as large.
                uint32 horizontalScale = (view->width << 12) / window->width;
                uint32 verticalScale = (view->height << 12) / window->height;
                uint32 horizontalScaleUV = horizontalScale >> 1;
                uint32 verticalScaleUV = verticalScale >> 1;
                horizontalScale = horizontalScaleUV << 1;
                verticalScale = verticalScaleUV << 1;

                // we need to offset the overlay view to adapt it to the clipping
                // (in addition to whatever offset is desired already)
                left = view->h_start - (int32)((window->h_start - left)
                        * (horizontalScale / 4096.0) + 0.5);
                top = view->v_start - (int32)((window->v_start - top)
                        * (verticalScale / 4096.0) + 0.5);
                right = view->h_start + view->width;
                bottom = view->v_start + view->height;

                // byte offset of the first visible source pixel within the
                // buffer; it is added to the buffer offset further below
                gInfo->overlay_position_buffer_offset = buffer->bytes_per_row * top
                        + left * bytesPerPixel;

                // Note: in non-planar mode, you *must* not program the source
                // width/height UV registers - they must stay cleared, or the chip is
                // doing strange stuff.
                // On the other hand, you have to program the UV scaling registers, or
                // the result will be wrong, too.
                registers->source_width_rgb = right - left;
                registers->source_height_rgb = bottom - top;
                // NOTE(review): both variants use view->width << 1, i.e. they
                // assume 2 bytes per pixel, independent of bytesPerPixel -
                // confirm that this is intended for B_RGB32 buffers
                if (gInfo->shared_info->device_type.InFamily(INTEL_FAMILY_8xx)) {
                        registers->source_bytes_per_row_rgb = (((overlay->buffer_offset
                                + (view->width << 1) + 0x1f) >> 5)
                                - (overlay->buffer_offset >> 5) - 1) << 2;
                } else {
                        int yaddress = overlay->buffer_offset;
                        int yswidth = view->width << 1;
                        registers->source_bytes_per_row_rgb = (((((yaddress
                                + yswidth + 0x3f) >> 6) - (yaddress >> 6)) << 1) - 1) << 2;
                }

                // horizontal scaling
                // (integer part and 12 bit fraction are programmed separately)
                registers->scale_rgb.horizontal_downscale_factor
                        = horizontalScale >> 12;
                registers->scale_rgb.horizontal_scale_fraction
                        = horizontalScale & 0xfff;
                registers->scale_uv.horizontal_downscale_factor
                        = horizontalScaleUV >> 12;
                registers->scale_uv.horizontal_scale_fraction
                        = horizontalScaleUV & 0xfff;

                // vertical scaling
                registers->scale_rgb.vertical_scale_fraction = verticalScale & 0xfff;
                registers->scale_uv.vertical_scale_fraction = verticalScaleUV & 0xfff;
                registers->vertical_scale_rgb = verticalScale >> 12;
                registers->vertical_scale_uv = verticalScaleUV >> 12;

                TRACE("scale: h = %ld.%ld, v = %ld.%ld\n", horizontalScale >> 12,
                        horizontalScale & 0xfff, verticalScale >> 12,
                        verticalScale & 0xfff);

                if (verticalScale != gInfo->last_vertical_overlay_scale
                        || horizontalScale != gInfo->last_horizontal_overlay_scale) {
                        // Recompute phase coefficients (taken from X driver)
                        // Only the horizontal coefficients are computed and written
                        // here.
                        updateCoefficients = true;

                        phase_coefficient coefficients[NUM_HORIZONTAL_TAPS * NUM_PHASES];
                        update_coefficients(NUM_HORIZONTAL_TAPS, horizontalScale / 4096.0,
                                true, true, coefficients);

                        phase_coefficient coefficientsUV[
                                NUM_HORIZONTAL_UV_TAPS * NUM_PHASES];
                        update_coefficients(NUM_HORIZONTAL_UV_TAPS,
                                horizontalScaleUV / 4096.0, true, false, coefficientsUV);

                        // pack each coefficient as sign (bit 15), exponent (from
                        // bit 12 on), and mantissa (low bits)
                        int32 pos = 0;
                        for (int32 i = 0; i < NUM_PHASES; i++) {
                                for (int32 j = 0; j < NUM_HORIZONTAL_TAPS; j++) {
                                        registers->horizontal_coefficients_rgb[pos]
                                                = coefficients[pos].sign << 15
                                                        | coefficients[pos].exponent << 12
                                                        | coefficients[pos].mantissa;
                                        pos++;
                                }
                        }

                        pos = 0;
                        for (int32 i = 0; i < NUM_PHASES; i++) {
                                for (int32 j = 0; j < NUM_HORIZONTAL_UV_TAPS; j++) {
                                        registers->horizontal_coefficients_uv[pos]
                                                = coefficientsUV[pos].sign << 15
                                                        | coefficientsUV[pos].exponent << 12
                                                        | coefficientsUV[pos].mantissa;
                                        pos++;
                                }
                        }

                        gInfo->last_vertical_overlay_scale = verticalScale;
                        gInfo->last_horizontal_overlay_scale = horizontalScale;
                }

                // remember what has been programmed, so that unchanged
                // configurations can skip this block the next time
                gInfo->last_overlay_view = *view;
                gInfo->last_overlay_frame = *(overlay_frame*)window;
        }

        registers->color_control_output_mode = true;
        registers->select_pipe = 0;

        // program buffer

        registers->buffer_rgb0
                = overlay->buffer_offset + gInfo->overlay_position_buffer_offset;
        registers->stride_rgb = buffer->bytes_per_row;

        registers->mirroring_mode
                = (window->flags & B_OVERLAY_HORIZONTAL_MIRRORING) != 0
                        ? OVERLAY_MIRROR_HORIZONTAL : OVERLAY_MIRROR_NORMAL;
        registers->ycbcr422_order = 0;

        if (!gInfo->shared_info->overlay_active) {
                // overlay is shown for the first time
                // (the color key is only programmed on this path)
                set_color_key(window);
                show_overlay();
        } else
                update_overlay(updateCoefficients);

        gInfo->current_overlay = overlay;
        return B_OK;
}