root/src/system/kernel/util/convertutf.cpp
/*
 * Copyright 2014 Jonathan Schleifer <js@webkeks.org>
 * Copyright 2014 Haiku, Inc. All rights reserved.
 *
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *              Jonathan Schleifer, js@webkeks.org
 *              John Scipione, jscipione@gmail.com
 */


#include <util/convertutf.h>


#include <ByteOrder.h>
#include <Errors.h>
#include <StorageDefs.h>


static inline size_t
glyph_length(uint32 glyph)
{
        if (glyph < 0x80)
                return 1;
        else if (glyph < 0x800)
                return 2;
        else if (glyph < 0x10000)
                return 3;
        else if (glyph < 0x110000)
                return 4;

        return 0;
}


static void
encode_glyph(uint32 glyph, size_t glyphLength, char* buffer)
{
        if (glyphLength == 1) {
                *buffer = glyph;
        } else if (glyphLength == 2) {
                *buffer++ = 0xC0 | (glyph >> 6);
                *buffer = 0x80 | (glyph & 0x3F);
        } else if (glyphLength == 3) {
                *buffer++ = 0xE0 | (glyph >> 12);
                *buffer++ = 0x80 | (glyph >> 6 & 0x3F);
                *buffer = 0x80 | (glyph & 0x3F);
        } else if (glyphLength == 4) {
                *buffer++ = 0xF0 | (glyph >> 18);
                *buffer++ = 0x80 | (glyph >> 12 & 0x3F);
                *buffer++ = 0x80 | (glyph >> 6 & 0x3F);
                *buffer = 0x80 | (glyph & 0x3F);
        }
}


static ssize_t
utf16_to_utf8(const uint16* source, size_t sourceCodeUnitCount, char* target,
        size_t targetLength, bool isLittleEndian)
{
        if (source == NULL || sourceCodeUnitCount == 0
                || target == NULL || targetLength == 0) {
                return B_BAD_VALUE;
        }

        ssize_t outLength = 0;

        for (size_t i = 0; i < sourceCodeUnitCount; i++) {
                uint32 glyph = isLittleEndian
                        ? B_LENDIAN_TO_HOST_INT32(source[i])
                        : B_BENDIAN_TO_HOST_INT32(source[i]);

                if ((glyph & 0xFC00) == 0xDC00) {
                        // missing high surrogate
                        return B_BAD_VALUE;
                }

                if ((glyph & 0xFC00) == 0xD800) {
                        if (sourceCodeUnitCount <= i + 1) {
                                // high surrogate at end of string
                                return B_BAD_VALUE;
                        }

                        uint32 low = isLittleEndian
                                ? B_LENDIAN_TO_HOST_INT32(source[i + 1])
                                : B_BENDIAN_TO_HOST_INT32(source[i + 1]);
                        if ((low & 0xFC00) != 0xDC00) {
                                // missing low surrogate
                                return B_BAD_VALUE;
                        }

                        glyph = (((glyph & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
                        i++;
                }

                size_t glyphLength = glyph_length(glyph);
                if (glyphLength == 0)
                        return B_BAD_VALUE;
                else if (outLength + glyphLength >= targetLength
                        || outLength + glyphLength >= B_FILE_NAME_LENGTH) {
                        // NUL terminate the string so the caller can use the
                        // abbreviated version in this case. Since the length
                        // isn't returned the caller will need to call strlen()
                        // to get the length of the string.
                        target[outLength] = '\0';
                        return B_NAME_TOO_LONG;
                }

                encode_glyph(glyph, glyphLength, target + outLength);
                outLength += glyphLength;
        }

        target[outLength] = '\0';

        return outLength;
}


ssize_t
utf16le_to_utf8(const uint16* source, size_t sourceCodeUnitCount,
        char* target, size_t targetLength)
{
        return utf16_to_utf8(source, sourceCodeUnitCount, target, targetLength,
                true);
}


ssize_t
utf16be_to_utf8(const uint16* source, size_t sourceCodeUnitCount,
        char* target, size_t targetLength)
{
        return utf16_to_utf8(source, sourceCodeUnitCount, target, targetLength,
                false);
}