root/src/apps/showimage/Filter.cpp
/*
 * Copyright 2003-2006, Haiku, Inc. All rights reserved.
 * Copyright 2004-2005 yellowTAB GmbH. All Rights Reserved.
 * Copyright 2006 Bernd Korz. All Rights Reserved
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *              Michael Pfeiffer, laplace@haiku-os.org
 *              Ryan Leavengood, leavengood@gmail.com
 *              yellowTAB GmbH
 *              Bernd Korz
 */

#include <scheduler.h>
#include <Debug.h>
#include <Screen.h>

#include <syscalls.h>

#include "Filter.h"


// Implementation of FilterThread
// Creates the worker for slice `i` of `n` of `filter`.
// With runInCurrentThread the slice is processed synchronously from within
// the constructor; otherwise a separate thread is spawned for it.
// The object disposes of itself (see Run() and the failure path below), so
// callers must not use the pointer returned by `new`.
FilterThread::FilterThread(Filter* filter, int32 i, int32 n,
        bool runInCurrentThread)
        :
        fFilter(filter),
        fI(i),
        fN(n)
{
        if (runInCurrentThread) {
                Run();
                return;
        }

        const thread_id worker = spawn_thread(worker_thread, "filter",
                suggest_thread_priority(B_STATUS_RENDERING), this);
        if (worker < 0) {
                // The thread could not be created; the destructor still
                // reports this slice as done to the filter.
                delete this;
                return;
        }

        resume_thread(worker);
}


// Tells the owning filter that this worker is finished.
FilterThread::~FilterThread()
{
        Filter* const owner = fFilter;
        owner->FilterThreadDone();
}


// Thread entry point; `data` is the FilterThread passed to spawn_thread().
status_t
FilterThread::worker_thread(void* data)
{
        return static_cast<FilterThread*>(data)->Run();
}


// Processes this worker's slice and then deletes the worker.
// The worker with index 0 additionally creates the destination image and
// starts the workers for all remaining slices.
// Returns B_ERROR if the destination image could not be created by the
// first worker, B_OK otherwise.
status_t
FilterThread::Run()
{
        const bool isFirstWorker = fI == 0;
        if (isFirstWorker) {
                // create destination image in first thread
                if (fFilter->GetBitmap() == NULL) {
                        fFilter->FilterThreadInitFailed();
                        // NOTE(review): this path returns without
                        // "delete this", unlike the regular exit below.
                        return B_ERROR;
                }

                // and start other filter threads
                for (int32 slice = fI + 1; slice < fN; slice++)
                        new FilterThread(fFilter, slice, fN);
        }

        if (fFilter->GetBitmap() != NULL)
                fFilter->Run(fI, fN);

        delete this;
        return B_OK;
}

// Implementation of Filter
// Prepares a filter operating on `image`. `listener` receives a message with
// the code `what` from FilterThreadDone() when the last worker finishes while
// the filter is still marked as running.
Filter::Filter(BBitmap* image, BMessenger listener, uint32 what)
        :
        fListener(listener),
        fWhat(what),
        fStarted(false),
        fN(0),
        fNumberOfThreads(0),
        fIsRunning(false),
        fSrcImage(image),
        fDestImageInitialized(false),
        fDestImage(NULL)
{
        // Released by the workers when they finish; Wait() acquires it.
        fWaitForThreads = create_sem(0, "wait_for_threads");

        fCPUCount = NumberOfActiveCPUs();

        #if TIME_FILTER
        fStopWatch = NULL;
        #endif
}


// Frees the destination image (unless it has been detached, in which case
// fDestImage is NULL) and the semaphore used to wait for the workers.
Filter::~Filter()
{
        delete fDestImage;

        delete_sem(fWaitForThreads);
}


// Returns the destination image, creating it via CreateDestImage() on the
// first call. Creation is attempted only once; if it failed, NULL is returned
// on this and all later calls.
BBitmap*
Filter::GetBitmap()
{
        if (fDestImageInitialized)
                return fDestImage;

        fDestImageInitialized = true;
        fDestImage = CreateDestImage(fSrcImage);
        return fDestImage;
}


// Hands the destination image over to the caller; afterwards the filter no
// longer references it (and therefore will not delete it).
BBitmap*
Filter::DetachBitmap()
{
        BBitmap* const detached = fDestImage;
        fDestImage = NULL;
        return detached;
}


// Starts filtering. Does nothing when the filter has already been started or
// when there is no source image.
// With async == false the first slice is processed in the calling thread and
// the call returns only after Wait() has collected all workers.
void
Filter::Start(bool async)
{
        if (fStarted)
                return;
        if (fSrcImage == NULL)
                return;

        #if TIME_FILTER
                fStopWatch = new BStopWatch("Filter Time");
        #endif

        fN = NumberOfThreads();
        fNumberOfThreads = fN;
        fIsRunning = true;
        fStarted = true;

        // start first filter thread
        const bool runInCurrentThread = !async;
        new FilterThread(this, 0, fN, runInCurrentThread);

        if (runInCurrentThread)
                Wait();
}


void
Filter::Wait()
{
        if (fStarted) {
                // wait for threads to exit
                while (acquire_sem_etc(fWaitForThreads, fN, 0, 0) == B_INTERRUPTED);
                // ready to start again
                fStarted = false;
        }
}


// Asks the workers to abandon their calculations (they poll IsRunning()) and
// waits until they are gone.
void
Filter::Stop()
{
        fIsRunning = false;
                // tell FilterThreads to stop calculations

        Wait();
}


// Returns whether the filter is currently flagged as running; the flag is set
// by Start() and cleared by Stop() or when the workers are done.
bool
Filter::IsRunning() const
{
        return fIsRunning;
}


// Called once the workers are done (or initialization failed).
// The base implementation intentionally does nothing.
void
Filter::Completed()
{
}


// Called by every worker when it goes away. The worker that brings the
// counter of outstanding workers from 1 to 0 runs the completion handling
// and notifies the listener, provided the filter has not been stopped.
// Every call releases the semaphore once for Wait().
void
Filter::FilterThreadDone()
{
        const bool wasLastWorker = atomic_add(&fNumberOfThreads, -1) == 1;
        if (wasLastWorker) {
                #if TIME_FILTER
                        delete fStopWatch; fStopWatch = NULL;
                #endif

                Completed();

                if (fIsRunning)
                        fListener.SendMessage(fWhat);
                fIsRunning = false;
        }

        release_sem(fWaitForThreads);
}


// Called by the first worker when the destination image could not be
// created, i.e. before any other worker has been started. Finishes the
// filter on behalf of all fN workers so that Wait() does not block.
void
Filter::FilterThreadInitFailed()
{
        // No worker may have finished yet.
        ASSERT(fNumberOfThreads == fN);

        fNumberOfThreads = 0;
        Completed();
        fIsRunning = false;

        // Release the semaphore once for every worker Wait() expects.
        release_sem_etc(fWaitForThreads, fN, 0);
}


// Returns true if `bitmap` is non-NULL, initialized successfully and valid.
bool
Filter::IsBitmapValid(BBitmap* bitmap) const
{
        if (bitmap == NULL)
                return false;
        if (bitmap->InitCheck() != B_OK)
                return false;

        return bitmap->IsValid();
}


// Returns the number of workers to use: one per 32 units of work, limited to
// the CPU count, and never fewer than one.
int32
Filter::NumberOfThreads()
{
        // at least 32 units per CPU
        const int32 wanted = GetNumberOfUnits() / 32;

        if (wanted > CPUCount())
                return CPUCount();
        if (wanted <= 0)
                return 1;
                        // at least one thread!

        return wanted;
}


// Returns the source image passed to the constructor (may be NULL).
BBitmap*
Filter::GetSrcImage()
{
        return fSrcImage;
}


// Returns the destination image without creating it; NULL if it has not been
// created yet, could not be created, or has been detached.
BBitmap*
Filter::GetDestImage()
{
        return fDestImage;
}


// Counts the CPUs reported as enabled by the kernel; returns at least 1.
int32
Filter::NumberOfActiveCPUs() const
{
        system_info info;
        get_system_info(&info);

        const int total = info.cpu_count;
        int32 enabled = 0;
        for (int cpu = 0; cpu < total; cpu++) {
                if (_kern_cpu_enabled(cpu))
                        enabled++;
        }

        return enabled == 0 ? 1 : enabled;
}


// Implementation of (bilinear) Scaler
// Creates a scaler that resizes `image` to the integer size of `rect`.
// With `dither` set, the result is additionally dithered to B_CMAP8 (see
// CreateDestImage() and Run()).
Scaler::Scaler(BBitmap* image, BRect rect, BMessenger listener, uint32 what,
        bool dither)
        :
        Filter(image, listener, what),
        fScaledImage(NULL),
        fRect(rect),
        fDither(dither)
{
        // nothing else to set up
}


// Deletes the intermediate scaled image unless it is the very same bitmap as
// the destination image.
Scaler::~Scaler()
{
        if (GetDestImage() == fScaledImage)
                return;

        delete fScaledImage;
        fScaledImage = NULL;
}


// Creates the destination bitmap in the size of fRect. Only B_RGB32 and
// B_RGBA32 sources are supported.
// Without dithering the destination doubles as the scaling target
// (fScaledImage). With dithering the destination is a B_CMAP8 bitmap and a
// separate bitmap in the source color space is allocated as scaling target.
// Returns NULL on unsupported input or if a bitmap could not be created.
BBitmap*
Scaler::CreateDestImage(BBitmap* srcImage)
{
        if (srcImage == NULL)
                return NULL;

        const color_space srcSpace = srcImage->ColorSpace();
        if (srcSpace != B_RGB32 && srcSpace != B_RGBA32)
                return NULL;

        const BRect bounds(0, 0, fRect.IntegerWidth(), fRect.IntegerHeight());

        BBitmap* destImage = new BBitmap(bounds, fDither ? B_CMAP8 : srcSpace);
        if (!IsBitmapValid(destImage)) {
                delete destImage;
                return NULL;
        }

        if (!fDither) {
                fScaledImage = destImage;
                return destImage;
        }

        fScaledImage = new BBitmap(bounds, srcSpace);
        if (IsBitmapValid(fScaledImage))
                return destImage;

        // The intermediate image could not be created; give up on both.
        delete destImage;
        delete fScaledImage;
        fScaledImage = NULL;
        return NULL;
}


// Returns true if this scaler was set up for the same integer width and
// height as `rect` and for the same dithering mode.
bool
Scaler::Matches(BRect rect, bool dither) const
{
        if (fRect.IntegerWidth() != rect.IntegerWidth())
                return false;
        if (fRect.IntegerHeight() != rect.IntegerHeight())
                return false;

        return fDither == dither;
}


// Scale bilinear using floating point calculations
// Per destination column data precomputed by Scaler::ScaleBilinear().
struct ColumnData {
        intType srcColumn;
                // integer part of the corresponding source column
        float alpha0;
                // 1 - alpha1
        float alpha1;
                // fractional part of the corresponding source column
};


// Scales the destination rows fromRow..toRow (inclusive) from the source
// image into fScaledImage using bilinear interpolation with floating point
// arithmetic. Both bitmaps are accessed with 4 bytes per pixel.
// srcW/srcH/destW/destH are used as the index of the last column/row; the
// last column and the last row are handled separately so that no pixel
// beyond them is read. The row loop ends early once IsRunning() is false.
void
Scaler::ScaleBilinear(intType fromRow, int32 toRow)
{
        BBitmap* src;
        BBitmap* dest;
        intType srcW, srcH;
        intType destW, destH;
        intType x, y, i;
        ColumnData* columnData;
        ColumnData* cd;
        const uchar* srcBits;
        uchar* destBits;
        intType srcBPR, destBPR;
        const uchar* srcData;
        uchar* destDataRow;
        uchar* destData;
        const int32 kBPP = 4;

        src = GetSrcImage();
        dest = fScaledImage;

        srcW = src->Bounds().IntegerWidth();
        srcH = src->Bounds().IntegerHeight();
        destW = dest->Bounds().IntegerWidth();
        destH = dest->Bounds().IntegerHeight();

        srcBits = (uchar*)src->Bits();
        destBits = (uchar*)dest->Bits();
        srcBPR = src->BytesPerRow();
        destBPR = dest->BytesPerRow();

        // Precompute source column and horizontal weights for every
        // destination column except the last one.
        columnData = new ColumnData[destW];
        cd = columnData;
        for (i = 0; i < destW; i++, cd++) {
                float column = (float)i * (float)srcW / (float)destW;
                cd->srcColumn = (intType)column;
                cd->alpha1 = column - cd->srcColumn;
                cd->alpha0 = 1.0 - cd->alpha1;
        }

        destDataRow = destBits + fromRow * destBPR;

        for (y = fromRow; IsRunning() && y <= toRow; y++, destDataRow += destBPR) {
                float row;
                intType srcRow;
                float alpha0, alpha1;

                // avoid division by zero for a destination of a single row
                if (destH == 0)
                        row = 0;
                else
                        row = (float)y * (float)srcH / (float)destH;

                // vertical weights: alpha0 for srcRow, alpha1 for srcRow + 1
                srcRow = (intType)row;
                alpha1 = row - srcRow;
                alpha0 = 1.0 - alpha1;

                srcData = srcBits + srcRow * srcBPR;
                destData = destDataRow;

                if (y < destH) {
                        // any row but the last: interpolate between the four
                        // neighbours a (top left), b (top right),
                        // c (bottom left) and d (bottom right)
                        float a0, a1;
                        const uchar *a, *b, *c, *d;

                        for (x = 0; x < destW; x ++, destData += kBPP) {
                                a = srcData + columnData[x].srcColumn * kBPP;
                                b = a + kBPP;
                                c = a + srcBPR;
                                d = c + kBPP;

                                a0 = columnData[x].alpha0;
                                a1 = columnData[x].alpha1;

                                destData[0] = static_cast<uchar>(
                                                                (a[0] * a0 + b[0] * a1) * alpha0 +
                                                                (c[0] * a0 + d[0] * a1) * alpha1);
                                destData[1] = static_cast<uchar>(
                                                                (a[1] * a0 + b[1] * a1) * alpha0 +
                                                                (c[1] * a0 + d[1] * a1) * alpha1);
                                destData[2] = static_cast<uchar>(
                                                                (a[2] * a0 + b[2] * a1) * alpha0 +
                                                                (c[2] * a0 + d[2] * a1) * alpha1);
                                destData[3] = static_cast<uchar>(
                                                                (a[3] * a0 + b[3] * a1) * alpha0 +
                                                                (c[3] * a0 + d[3] * a1) * alpha1);
                        }

                        // right column: interpolate vertically only
                        a = srcData + srcW * kBPP;
                        c = a + srcBPR;

                        destData[0] = static_cast<uchar>(a[0] * alpha0 + c[0] * alpha1);
                        destData[1] = static_cast<uchar>(a[1] * alpha0 + c[1] * alpha1);
                        destData[2] = static_cast<uchar>(a[2] * alpha0 + c[2] * alpha1);
                        destData[3] = static_cast<uchar>(a[3] * alpha0 + c[3] * alpha1);
                } else {
                        // last row: interpolate horizontally only
                        float a0, a1;
                        const uchar *a, *b;
                        for (x = 0; x < destW; x ++, destData += kBPP) {
                                a = srcData + columnData[x].srcColumn * kBPP;
                                b = a + kBPP;

                                a0 = columnData[x].alpha0;
                                a1 = columnData[x].alpha1;

                                destData[0] = static_cast<uchar>(a[0] * a0 + b[0] * a1);
                                destData[1] = static_cast<uchar>(a[1] * a0 + b[1] * a1);
                                destData[2] = static_cast<uchar>(a[2] * a0 + b[2] * a1);
                                destData[3] = static_cast<uchar>(a[3] * a0 + b[3] * a1);
                        }

                        // bottom, right pixel: plain copy
                        a = srcData + srcW * kBPP;

                        destData[0] = a[0];
                        destData[1] = a[1];
                        destData[2] = a[2];
                        destData[3] = a[3];
                }

        }

        delete[] columnData;
}


// Scale bilinear using fixed point calculations
// Is already more than two times faster than floating point version
// on AMD Athlon 1 GHz and Dual Intel Pentium III 866 MHz.

// Per destination column data precomputed by Scaler::ScaleBilinearFP().
struct ColumnDataFP {
        int32 srcColumn;
                // integer part of the corresponding source column
        fixed_point alpha0;
                // kFPOne - alpha1
        fixed_point alpha1;
                // fractional part of the corresponding source column
};


// Fixed point variant of ScaleBilinear(): scales the destination rows
// fromRow..toRow (inclusive) from the source image into fScaledImage using
// bilinear interpolation. Both bitmaps are accessed with 4 bytes per pixel.
// The last column and the last row are handled separately so that no pixel
// beyond them is read. The row loop ends early once IsRunning() is false.
void
Scaler::ScaleBilinearFP(intType fromRow, int32 toRow)
{
        BBitmap* src;
        BBitmap* dest;
        intType srcW, srcH;
        intType destW, destH;
        intType x, y, i;
        ColumnDataFP* columnData;
        ColumnDataFP* cd;
        const uchar* srcBits;
        uchar* destBits;
        intType srcBPR, destBPR;
        const uchar* srcData;
        uchar* destDataRow;
        uchar* destData;
        const int32 kBPP = 4;

        src = GetSrcImage();
        dest = fScaledImage;

        srcW = src->Bounds().IntegerWidth();
        srcH = src->Bounds().IntegerHeight();
        destW = dest->Bounds().IntegerWidth();
        destH = dest->Bounds().IntegerHeight();

        srcBits = (uchar*)src->Bits();
        destBits = (uchar*)dest->Bits();
        srcBPR = src->BytesPerRow();
        destBPR = dest->BytesPerRow();

        fixed_point fpSrcW = to_fixed_point(srcW);
        fixed_point fpDestW = to_fixed_point(destW);
        fixed_point fpSrcH = to_fixed_point(srcH);
        fixed_point fpDestH = to_fixed_point(destH);

        // Precompute source column and horizontal weights for every
        // destination column except the last one.
        columnData = new ColumnDataFP[destW];
        cd = columnData;
        for (i = 0; i < destW; i++, cd++) {
                fixed_point column = to_fixed_point(i) * (long_fixed_point)fpSrcW
                        / fpDestW;
                cd->srcColumn = from_fixed_point(column);
                cd->alpha1 = tail_value(column); // weight for right pixel value (b, d)
                cd->alpha0 = kFPOne - cd->alpha1; // weight for left pixel value (a, c)
        }

        destDataRow = destBits + fromRow * destBPR;

        for (y = fromRow; IsRunning() && y <= toRow; y++, destDataRow += destBPR) {
                fixed_point row;
                intType srcRow;
                fixed_point alpha0, alpha1;

                // avoid division by zero for a destination of a single row
                if (fpDestH == 0)
                        row = 0;
                else
                        row = to_fixed_point(y) * (long_fixed_point)fpSrcH / fpDestH;

                srcRow = from_fixed_point(row);
                alpha1 = tail_value(row); // weight for source row srcRow + 1
                alpha0 = kFPOne - alpha1; // weight for source row srcRow

                srcData = srcBits + srcRow * srcBPR;
                destData = destDataRow;

                // I4: interpolation between four pixels, V2: vertical
                // interpolation between two, H2: horizontal interpolation
                // between two. Note that V2 and H2 expand with a trailing
                // semicolon and that the macros stay defined after this
                // function.
                // Need mult_correction for "outer" multiplication only
                #define I4(i) from_fixed_point(mult_correction(\
                                                        (a[i] * a0 + b[i] * a1) * alpha0 + \
                                                        (c[i] * a0 + d[i] * a1) * alpha1))
                #define V2(i) from_fixed_point(a[i] * alpha0 + c[i] * alpha1);
                #define H2(i) from_fixed_point(a[i] * a0 + b[i] * a1);

                if (y < destH) {
                        // any row but the last: interpolate between the four
                        // neighbours a (top left), b (top right),
                        // c (bottom left) and d (bottom right)
                        fixed_point a0, a1;
                        const uchar *a, *b, *c, *d;

                        for (x = 0; x < destW; x ++, destData += kBPP) {
                                a = srcData + columnData[x].srcColumn * kBPP;
                                b = a + kBPP;
                                c = a + srcBPR;
                                d = c + kBPP;

                                a0 = columnData[x].alpha0;
                                a1 = columnData[x].alpha1;

                                destData[0] = I4(0);
                                destData[1] = I4(1);
                                destData[2] = I4(2);
                                destData[3] = I4(3);
                        }

                        // right column: interpolate vertically only
                        a = srcData + srcW * kBPP;
                        c = a + srcBPR;

                        destData[0] = V2(0);
                        destData[1] = V2(1);
                        destData[2] = V2(2);
                        destData[3] = V2(3);
                } else {
                        // last row: interpolate horizontally only
                        fixed_point a0, a1;
                        const uchar *a, *b;
                        for (x = 0; x < destW; x ++, destData += kBPP) {
                                a = srcData + columnData[x].srcColumn * kBPP;
                                b = a + kBPP;

                                a0 = columnData[x].alpha0;
                                a1 = columnData[x].alpha1;

                                destData[0] = H2(0);
                                destData[1] = H2(1);
                                destData[2] = H2(2);
                                destData[3] = H2(3);
                        }

                        // bottom, right pixel: plain copy
                        a = srcData + srcW * kBPP;

                        destData[0] = a[0];
                        destData[1] = a[1];
                        destData[2] = a[2];
                        destData[3] = a[3];
                }
        }

        delete[] columnData;
}


void
Scaler::RowValues(float* sum, const uchar* src, intType srcW, intType fromX,
        intType toX, const float a0X, const float a1X, const int32 kBPP)
{
        sum[0] = a0X * src[0];
        sum[1] = a0X * src[1];
        sum[2] = a0X * src[2];

        src += kBPP;

        for (int32 x = fromX + 1; x < toX; x++, src += kBPP) {
                sum[0] += src[0];
                sum[1] += src[1];
                sum[2] += src[2];
        }

        if (toX <= srcW) {
                sum[0] += a1X * src[0];
                sum[1] += a1X * src[1];
                sum[2] += a1X * src[2];
        }
}


// Per destination column data precomputed by Scaler::DownScaleBilinear().
struct DownScaleColumnData {
        int32 from;
                // first source column covered by the destination column
        int32 to;
                // source column in which the covered range ends
        float alpha0;
                // weight of source column `from`
        float alpha1;
                // weight of source column `to`
};


// Scales the destination rows fromRow..toRow (inclusive) down from the source
// image into fScaledImage. Every destination pixel is the average of the
// block of source pixels it covers (deltaX by deltaY source pixels), where
// partially covered border pixels are weighted accordingly.
// Both bitmaps are accessed with 4 bytes per pixel. The row loop ends early
// once IsRunning() is false.
// NOTE(review): only bytes 0..2 of each destination pixel are written here;
// byte 3 is left untouched by this function.
void
Scaler::DownScaleBilinear(intType fromRow, int32 toRow)
{
        BBitmap* src;
        BBitmap* dest;
        intType srcW, srcH;
        intType destW, destH;
        intType x, y;
        const uchar* srcBits;
        uchar* destBits;
        intType srcBPR, destBPR;
        const uchar* srcData;
        uchar* destDataRow;
        uchar* destData;
        const int32 kBPP = 4;
        DownScaleColumnData* columnData;

        src = GetSrcImage();
        dest = fScaledImage;

        srcW = src->Bounds().IntegerWidth();
        srcH = src->Bounds().IntegerHeight();
        destW = dest->Bounds().IntegerWidth();
        destH = dest->Bounds().IntegerHeight();

        srcBits = (uchar*)src->Bits();
        destBits = (uchar*)dest->Bits();
        srcBPR = src->BytesPerRow();
        destBPR = dest->BytesPerRow();

        destDataRow = destBits + fromRow * destBPR;

        // number of source pixels per destination pixel in each direction,
        // and the resulting area used to normalize the sums
        const float deltaX = (srcW + 1.0) / (destW + 1.0);
        const float deltaY = (srcH + 1.0) / (destH + 1.0);
        const float deltaXY = deltaX * deltaY;

        // Precompute the covered source column range and the weights of its
        // first and last column for every destination column.
        columnData = new DownScaleColumnData[destW + 1];
        DownScaleColumnData* cd = columnData;
        for (x = 0; x <= destW; x++, cd++) {
                const float fFromX = x * deltaX;
                const float fToX = fFromX + deltaX;

                cd->from = (intType)fFromX;
                cd->to = (intType)fToX;

                cd->alpha0 = 1.0 - (fFromX - cd->from);
                cd->alpha1 = fToX - cd->to;
        }

        for (y = fromRow; IsRunning() && y <= toRow; y ++, destDataRow += destBPR) {
                // covered source row range and the weights of its first and
                // last row
                const float fFromY = y * deltaY;
                const float fToY = fFromY + deltaY;

                const intType fromY = (intType)fFromY;
                const intType toY = (intType)fToY;

                const float a0Y = 1.0 - (fFromY - fromY);
                const float a1Y = fToY - toY;

                const uchar* srcDataRow = srcBits + fromY * srcBPR;
                destData = destDataRow;

                cd = columnData;
                for (x = 0; x <= destW; x++, destData += kBPP, cd++) {
                        const intType fromX = cd->from;
                        const intType toX = cd->to;

                        const float a0X = cd->alpha0;
                        const float a1X = cd->alpha1;

                        srcData = srcDataRow + fromX * kBPP;

                        float totalSum[3];
                        float sum[3];

                        // partially covered first row
                        RowValues(sum, srcData, srcW, fromX, toX, a0X, a1X, kBPP);
                        totalSum[0] = a0Y * sum[0];
                        totalSum[1] = a0Y * sum[1];
                        totalSum[2] = a0Y * sum[2];

                        srcData += srcBPR;

                        // fully covered rows
                        for (int32 r = fromY + 1; r < toY; r++, srcData += srcBPR) {
                                RowValues(sum, srcData, srcW, fromX, toX, a0X, a1X, kBPP);
                                totalSum[0] += sum[0];
                                totalSum[1] += sum[1];
                                totalSum[2] += sum[2];
                        }

                        // partially covered last row, if still inside the image
                        if (toY <= srcH) {
                                RowValues(sum, srcData, srcW, fromX, toX, a0X, a1X, kBPP);
                                totalSum[0] += a1Y * sum[0];
                                totalSum[1] += a1Y * sum[1];
                                totalSum[2] += a1Y * sum[2];
                        }

                        // normalize by the covered area
                        destData[0] = static_cast<uchar>(totalSum[0] / deltaXY);
                        destData[1] = static_cast<uchar>(totalSum[1] / deltaXY);
                        destData[2] = static_cast<uchar>(totalSum[2] / deltaXY);
                }
        }

        delete[] columnData;
}


// Floyd-Steinberg Dithering
// Filter (distribution of error to adjacent pixels, X is current pixel):
// 0 X 7
// 3 5 1

// Error (in sixteenths) accumulated by Scaler::Dither() for one column of
// the following row; one value per color component.
struct DitheringColumnData {
        intType error[3];
};


// Clamps `value` to the range 0..255.
uchar
Scaler::Limit(intType value)
{
        if (value < 0)
                return 0;
        if (value > 255)
                return 255;

        return value;
}


// Dithers rows fromRow..toRow (inclusive) of the scaled 32 bit image
// (fScaledImage) into the B_CMAP8 destination image using error diffusion.
// Rows are processed alternately left to right and right to left. The color
// indices are looked up via BScreen. The row loop ends early once IsRunning()
// is false.
//
// Errors are kept in sixteenths: `error` holds the error handed to the next
// pixel of the current row, columnData[k] collects the error for column k of
// the following row.
void
Scaler::Dither(int32 fromRow, int32 toRow)
{
        BBitmap* src;
        BBitmap* dest;
        intType destW;
        intType x, y;

        uchar* srcBits;
        intType srcBPR;
        uchar* srcDataRow;
        uchar* srcData;

        uchar* destBits;
        intType destBPR;
        uchar* destDataRow;
        uchar* destData;
        const int32 kBPP = 4;
        DitheringColumnData* columnData0;
        DitheringColumnData* columnData;
        DitheringColumnData* cd;
        BScreen screen;
        intType error[3], err[3];

        src = fScaledImage;
        dest = GetDestImage();

        ASSERT(src->ColorSpace() == B_RGB32 || src->ColorSpace() == B_RGBA32);
        ASSERT(dest->ColorSpace() == B_CMAP8);
        ASSERT(src->Bounds().IntegerWidth() == dest->Bounds().IntegerWidth());
        ASSERT(src->Bounds().IntegerHeight() == dest->Bounds().IntegerHeight());

        destW = dest->Bounds().IntegerWidth();

        srcBits = (uchar*)src->Bits();
        srcBPR = src->BytesPerRow();
        destBits = (uchar*)dest->Bits();
        destBPR = dest->BytesPerRow();

        // Allocate space for sentinel at left and right bounds,
        // so that columnData[-1] and columnData[destW + 1] can be safely accessed
        columnData0 = new DitheringColumnData[destW + 3];
        columnData = columnData0 + 1;

        // clear error of all columns, including both sentinels, so that no
        // indeterminate value is ever read
        cd = columnData0;
        for (x = 0; x < destW + 3; x++, cd++) {
                cd->error[0] = cd->error[1] = cd->error[2] = 0;
        }

        srcDataRow = srcBits + fromRow * srcBPR;
        destDataRow = destBits + fromRow * destBPR;
        for (y = fromRow; IsRunning() && y <= toRow; y++, srcDataRow += srcBPR,
                destDataRow += destBPR) {
                // left to right
                error[0] = error[1] = error[2] = 0;
                srcData = srcDataRow;
                destData = destDataRow;
                for (x = 0; x <= destW; x ++, srcData += kBPP, destData += 1) {
                        rgb_color color, actualColor;
                        uint8 index;

                        color.red = Limit(srcData[2] + error[0] / 16);
                        color.green = Limit(srcData[1] + error[1] / 16);
                        color.blue = Limit(srcData[0] + error[2] / 16);
                        color.alpha = UINT8_MAX;

                        index = screen.IndexForColor(color);
                        actualColor = screen.ColorForIndex(index);

                        *destData = index;

                        err[0] = color.red - actualColor.red;
                        err[1] = color.green - actualColor.green;
                        err[2] = color.blue - actualColor.blue;

                        // distribute error
                        // get error for next pixel
                        cd = &columnData[x + 1];
                        error[0] = cd->error[0] + 7 * err[0];
                        error[1] = cd->error[1] + 7 * err[1];
                        error[2] = cd->error[2] + 7 * err[2];

                        // set error for right pixel below current pixel
                        cd->error[0] = err[0];
                        cd->error[1] = err[1];
                        cd->error[2] = err[2];

                        // add error for pixel below current pixel
                        cd--;
                        cd->error[0] += 5 * err[0];
                        cd->error[1] += 5 * err[1];
                        cd->error[2] += 5 * err[2];

                        // add error for left pixel below current pixel
                        cd--;
                        cd->error[0] += 3 * err[0];
                        cd->error[1] += 3 * err[1];
                        cd->error[2] += 3 * err[2];
                }
                // Note: Algorithm has good results with "left to right" already
                // Optionally remove code to end of block:
                y++;
                srcDataRow += srcBPR; destDataRow += destBPR;
                if (y > toRow) break;
                // right to left
                error[0] = error[1] = error[2] = 0;
                srcData = srcDataRow + destW * kBPP;
                destData = destDataRow + destW;
                // x is the column of the current pixel and has to run from
                // destW down to 0 together with srcData/destData, so that
                // columnData is addressed by the actual column.
                for (x = destW; x >= 0; x--, srcData -= kBPP, destData -= 1) {
                        rgb_color color, actualColor;
                        uint8 index;

                        color.red = Limit(srcData[2] + error[0] / 16);
                        color.green = Limit(srcData[1] + error[1] / 16);
                        color.blue = Limit(srcData[0] + error[2] / 16);
                        color.alpha = UINT8_MAX;

                        index = screen.IndexForColor(color);
                        actualColor = screen.ColorForIndex(index);

                        *destData = index;

                        err[0] = color.red - actualColor.red;
                        err[1] = color.green - actualColor.green;
                        err[2] = color.blue - actualColor.blue;

                        // distribute error
                        // get error for next pixel
                        cd = &columnData[x - 1];
                        error[0] = cd->error[0] + 7 * err[0];
                        error[1] = cd->error[1] + 7 * err[1];
                        error[2] = cd->error[2] + 7 * err[2];

                        // set error for left pixel below current pixel
                        cd->error[0] = err[0];
                        cd->error[1] = err[1];
                        cd->error[2] = err[2];

                        // add error for pixel below current pixel
                        cd++;
                        cd->error[0] += 5 * err[0];
                        cd->error[1] += 5 * err[1];
                        cd->error[2] += 5 * err[2];

                        // add error for right pixel below current pixel
                        cd++;
                        cd->error[0] += 3 * err[0];
                        cd->error[1] += 3 * err[1];
                        cd->error[2] += 3 * err[2];
                }
        }

        delete[] columnData0;
}


// The unit of work of the scaler is one row of the target rectangle.
int32
Scaler::GetNumberOfUnits()
{
        const int32 rows = fRect.IntegerHeight() + 1;
        return rows;
}


// Processes slice `i` of `n`: the rows of the destination image are split
// evenly between the workers, the last worker also takes the remainder.
// The slice is scaled up with ScaleBilinearFP() if the destination is at
// least as wide as the source, scaled down with DownScaleBilinear()
// otherwise, and finally dithered if requested.
void
Scaler::Run(int32 i, int32 n)
{
        const int32 imageHeight = GetDestImage()->Bounds().IntegerHeight() + 1;
        const int32 rowsPerWorker = imageHeight / n;

        const int32 from = i * rowsPerWorker;
        const int32 to = (i + 1 == n)
                ? imageHeight - 1 : from + rowsPerWorker - 1;

        const bool scaleUp = GetDestImage()->Bounds().Width()
                >= GetSrcImage()->Bounds().Width();
        if (scaleUp)
                ScaleBilinearFP(from, to);
        else
                DownScaleBilinear(from, to);

        if (fDither)
                Dither(from, to);
}


// Drops the reference to the intermediate scaled image once the workers are
// done; the bitmap itself is deleted only if it is not the destination image.
void
Scaler::Completed()
{
        const bool isSeparateBitmap = GetDestImage() != fScaledImage;
        if (isSeparateBitmap)
                delete fScaledImage;

        fScaledImage = NULL;
}


// Implementation of ImageProcessor
// Creates a processor applying operation `op` to `image`. The geometry
// members are filled in later by CreateDestImage().
ImageProcessor::ImageProcessor(enum operation op, BBitmap* image,
        BMessenger listener, uint32 what)
        :
        Filter(image, listener, what),
        fOp(op),
        fBPP(0),
        fWidth(0),
        fHeight(0),
        fSrcBPR(0),
        fDestBPR(0)
{
        // nothing else to set up
}


// Creates the destination bitmap in the color space of the source image and
// caches the geometry (bytes per pixel, last column/row index, bytes per row)
// used by Run(). For the rotations, width and height are swapped.
// Returns NULL if there is no source image, its color space has less than
// one byte per pixel or is unknown, or the bitmap could not be created.
BBitmap*
ImageProcessor::CreateDestImage(BBitmap* /* srcImage */)
{
        BBitmap* const source = GetSrcImage();
        if (source == NULL)
                return NULL;

        const color_space cs = source->ColorSpace();
        fBPP = BytesPerPixel(cs);
        if (fBPP < 1)
                return NULL;

        fWidth = source->Bounds().IntegerWidth();
        fHeight = source->Bounds().IntegerHeight();

        const bool isRotation = fOp == kRotateClockwise
                || fOp == kRotateCounterClockwise;

        BRect rect;
        if (isRotation)
                rect.Set(0, 0, fHeight, fWidth);
        else
                rect.Set(0, 0, fWidth, fHeight);

        BBitmap* bm = new BBitmap(rect, cs);
        if (!IsBitmapValid(bm)) {
                delete bm;
                return NULL;
        }

        fSrcBPR = source->BytesPerRow();
        fDestBPR = bm->BytesPerRow();

        return bm;
}


// The unit of work of the image processor is one row of the source image.
int32
ImageProcessor::GetNumberOfUnits()
{
        const int32 rows = GetSrcImage()->Bounds().IntegerHeight() + 1;
        return rows;
}


// Returns the number of bytes per pixel of color space `cs`: 0 for B_GRAY1
// and -1 for any color space not listed here.
int32
ImageProcessor::BytesPerPixel(color_space cs) const
{
        switch (cs) {
                // 32 bit
                case B_RGB32:
                case B_RGB32_BIG:
                case B_RGBA32:
                case B_RGBA32_BIG:
                        return 4;

                // 24 bit
                case B_RGB24_BIG:
                case B_RGB24:
                        return 3;

                // 15 and 16 bit
                case B_RGB16:
                case B_RGB16_BIG:
                case B_RGB15:
                case B_RGB15_BIG:
                case B_RGBA15:
                case B_RGBA15_BIG:
                        return 2;

                // 8 bit
                case B_GRAY8:
                case B_CMAP8:
                        return 1;

                // less than one byte per pixel
                case B_GRAY1:
                        return 0;

                default:
                        return -1;
        }
}


// Copies the pixel at (x, y) of the source bits to (destX, destY) of the
// destination bits. `dest` and `src` point to the start of the respective
// bitmap data. Nothing is copied unless fBPP is between 1 and 4.
void
ImageProcessor::CopyPixel(uchar* dest, int32 destX, int32 destY,
        const uchar* src, int32 x, int32 y)
{
        // Note: On my systems (Dual Intel P3 866MHz and AMD Athlon 1GHz),
        // replacing the multiplications below with pointer arithmetic showed
        // no speedup at all!
        uchar* const to = dest + fDestBPR * destY + destX * fBPP;
        const uchar* const from = src + fSrcBPR * y + x * fBPP;

        // Replacing memcpy with this switch statement is slightly faster.
        // Each case copies one byte and falls through to the next lower one.
        switch (fBPP) {
                case 4:
                        to[3] = from[3];
                        // fall through
                case 3:
                        to[2] = from[2];
                        // fall through
                case 2:
                        to[1] = from[1];
                        // fall through
                case 1:
                        to[0] = from[0];
                        break;
        }
}


// Note: For B_CMAP8 InvertPixel inverts the color index not the color value!
// Writes the bitwise inverse of the source pixel at (x, y) to the same
// position of the destination. For 4 byte pixels the byte at offset 3 (the
// alpha channel) is left untouched. Nothing is written unless fBPP is
// between 1 and 4.
void
ImageProcessor::InvertPixel(int32 x, int32 y, uchar* dest, const uchar* src)
{
        uchar* const to = dest + fDestBPR * y + x * fBPP;
        const uchar* const from = src + fSrcBPR * y + x * fBPP;

        // Each case handles one byte and falls through to the next lower one.
        switch (fBPP) {
                case 4:
                        // DON'T invert alpha channel (byte 3)
                        // fall through
                case 3:
                        to[2] = ~from[2];
                        // fall through
                case 2:
                        to[1] = ~from[1];
                        // fall through
                case 1:
                        to[0] = ~from[0];
                        break;
        }
}


// Note: On my systems, the operation kInvert shows a speedup on
// multiple CPUs only!
// Processes slice `i` of `n`: the source rows are split evenly between the
// workers, the last worker also takes the remainder. Every pixel of the
// slice is copied to its new position (rotation, flip) or inverted in place,
// depending on fOp. fWidth and fHeight are the indices of the last column
// and row of the source image.
void
ImageProcessor::Run(int32 i, int32 n)
{
        const int32 rowsPerWorker = (fHeight + 1) / n;
        const int32 from = i * rowsPerWorker;
        const int32 to = (i + 1 == n) ? fHeight : from + rowsPerWorker - 1;

        const uchar* src = (uchar*)GetSrcImage()->Bits();
        uchar* dest = (uchar*)GetDestImage()->Bits();

        switch (fOp) {
                case kRotateClockwise:
                        // source row y becomes destination column fHeight - y
                        for (int32 y = from; y <= to; y++) {
                                for (int32 x = 0; x <= fWidth; x++)
                                        CopyPixel(dest, fHeight - y, x, src, x, y);
                        }
                        break;

                case kRotateCounterClockwise:
                        // source row y becomes destination column y
                        for (int32 y = from; y <= to; y++) {
                                for (int32 x = 0; x <= fWidth; x++)
                                        CopyPixel(dest, y, fWidth - x, src, x, y);
                        }
                        break;

                case kFlipTopToBottom:
                        for (int32 y = from; y <= to; y++) {
                                for (int32 x = 0; x <= fWidth; x++)
                                        CopyPixel(dest, x, fHeight - y, src, x, y);
                        }
                        break;

                case kFlipLeftToRight:
                        for (int32 y = from; y <= to; y++) {
                                for (int32 x = 0; x <= fWidth; x++)
                                        CopyPixel(dest, fWidth - x, y, src, x, y);
                        }
                        break;

                case kInvert:
                        for (int32 y = from; y <= to; y++) {
                                for (int32 x = 0; x <= fWidth; x++)
                                        InvertPixel(x, y, dest, src);
                        }
                        break;
        }
}