root/drivers/tty/vt/ucs.c
// SPDX-License-Identifier: GPL-2.0
/*
 * ucs.c - Universal Character Set processing
 */

#include <linux/array_size.h>
#include <linux/bsearch.h>
#include <linux/consolemap.h>
#include <linux/minmax.h>

/*
 * Closed range of code points whose bounds both fit in 16 bits.
 * A code point cp belongs to the range when first <= cp <= last.
 */
struct ucs_interval16 {
        u16 first;      /* lowest code point in the range */
        u16 last;       /* highest code point in the range (inclusive) */
};

/*
 * Closed range of code points with 32-bit bounds.
 * A code point cp belongs to the range when first <= cp <= last.
 */
struct ucs_interval32 {
        u32 first;      /* lowest code point in the range */
        u32 last;       /* highest code point in the range (inclusive) */
};

#include "ucs_width_table.h"

/*
 * Binary search comparator for tables of struct ucs_interval16.
 * @key points to the u16 code point being looked up, @element to the
 * interval under test.
 *
 * Returns 0 when the code point lies within [first, last], -1 when it is
 * below the interval and 1 when it is above it.
 */
static int interval16_cmp(const void *key, const void *element)
{
        const struct ucs_interval16 *range = element;
        const u16 *cp = key;

        if (*cp >= range->first && *cp <= range->last)
                return 0;

        return *cp < range->first ? -1 : 1;
}

/*
 * Binary search comparator for tables of struct ucs_interval32.
 * @key points to the u32 code point being looked up, @element to the
 * interval under test.
 *
 * Returns 0 when the code point lies within [first, last], -1 when it is
 * below the interval and 1 when it is above it.
 */
static int interval32_cmp(const void *key, const void *element)
{
        const struct ucs_interval32 *range = element;
        const u32 *cp = key;

        if (*cp >= range->first && *cp <= range->last)
                return 0;

        return *cp < range->first ? -1 : 1;
}

/*
 * Report whether @cp falls inside any of the @size intervals in @ranges.
 *
 * The first and last table entries are read for a quick reject, so @size
 * must be at least 1. The binary search that follows presumes the intervals
 * are stored in ascending order (the table definitions are not visible
 * from here).
 */
static bool cp_in_range16(u16 cp, const struct ucs_interval16 *ranges, size_t size)
{
        const void *hit;

        /* Quick reject: below the first interval or above the last one */
        if (cp < ranges[0].first)
                return false;
        if (cp > ranges[size - 1].last)
                return false;

        hit = __inline_bsearch(&cp, ranges, size, sizeof(ranges[0]),
                               interval16_cmp);
        return hit != NULL;
}

/*
 * Report whether @cp falls inside any of the @size intervals in @ranges.
 *
 * The first and last table entries are read for a quick reject, so @size
 * must be at least 1. The binary search that follows presumes the intervals
 * are stored in ascending order (the table definitions are not visible
 * from here).
 */
static bool cp_in_range32(u32 cp, const struct ucs_interval32 *ranges, size_t size)
{
        const void *hit;

        /* Quick reject: below the first interval or above the last one */
        if (cp < ranges[0].first)
                return false;
        if (cp > ranges[size - 1].last)
                return false;

        hit = __inline_bsearch(&cp, ranges, size, sizeof(ranges[0]),
                               interval32_cmp);
        return hit != NULL;
}

/*
 * True when @cp fits in 16 bits (at most 0xffff), i.e. it is in BMP range.
 * Used to pick the u16-based lookup tables over the u32-based ones.
 */
#define UCS_IS_BMP(cp)  ((cp) <= 0xffff)

/**
 * ucs_is_zero_width() - Determine if a Unicode code point is zero-width.
 * @cp: Unicode code point (UCS-4)
 *
 * Code points up to 0xffff are searched in the 16-bit interval table,
 * all others in the 32-bit interval table.
 *
 * Return: true if the character is zero-width, false otherwise
 */
bool ucs_is_zero_width(u32 cp)
{
        if (!UCS_IS_BMP(cp))
                return cp_in_range32(cp, ucs_zero_width_non_bmp_ranges,
                                     ARRAY_SIZE(ucs_zero_width_non_bmp_ranges));

        return cp_in_range16(cp, ucs_zero_width_bmp_ranges,
                             ARRAY_SIZE(ucs_zero_width_bmp_ranges));
}

/**
 * ucs_is_double_width() - Determine if a Unicode code point is double-width.
 * @cp: Unicode code point (UCS-4)
 *
 * Code points up to 0xffff are searched in the 16-bit interval table,
 * all others in the 32-bit interval table.
 *
 * Return: true if the character is double-width, false otherwise
 */
bool ucs_is_double_width(u32 cp)
{
        if (!UCS_IS_BMP(cp))
                return cp_in_range32(cp, ucs_double_width_non_bmp_ranges,
                                     ARRAY_SIZE(ucs_double_width_non_bmp_ranges));

        return cp_in_range16(cp, ucs_double_width_bmp_ranges,
                             ARRAY_SIZE(ucs_double_width_bmp_ranges));
}

/*
 * One recomposition rule: a (base, combining mark) pair together with the
 * single character that replaces the pair.
 * Using u16 to save space since all values are within BMP range.
 */
struct ucs_recomposition {
        u16 base;       /* base character */
        u16 mark;       /* combining mark following the base */
        u16 recomposed; /* character resulting from base + mark */
};

#include "ucs_recompose_table.h"

/*
 * Search key used when looking up the recomposition table: the
 * (base, mark) pair to be matched against struct ucs_recomposition entries.
 */
struct compare_key {
        u16 base;       /* base character to look for */
        u16 mark;       /* combining mark to look for */
};

/*
 * Binary search comparator for the recomposition table.
 * @key points to a struct compare_key, @element to a table entry.
 *
 * Ordering is by base character first, then by combining mark.
 * Returns -1, 0 or 1 accordingly.
 */
static int recomposition_cmp(const void *key, const void *element)
{
        const struct compare_key *wanted = key;
        const struct ucs_recomposition *rule = element;

        /* Primary sort key: the base character */
        if (wanted->base != rule->base)
                return wanted->base < rule->base ? -1 : 1;

        /* Same base: the combining mark decides */
        if (wanted->mark != rule->mark)
                return wanted->mark < rule->mark ? -1 : 1;

        return 0;
}

/**
 * ucs_recompose() - Attempt to recompose two Unicode characters into a single character.
 * @base: Base Unicode code point (UCS-4)
 * @mark: Combining mark Unicode code point (UCS-4)
 *
 * The (@base, @mark) pair is looked up in the recomposition table with a
 * binary search. Pairs outside the table's base/mark bounds are rejected
 * up front without searching.
 *
 * Return: Recomposed Unicode code point, or 0 if no recomposition is possible
 */
u32 ucs_recompose(u32 base, u32 mark)
{
        /* Check if characters are within the range of our table */
        if (base < UCS_RECOMPOSE_MIN_BASE || base > UCS_RECOMPOSE_MAX_BASE ||
            mark < UCS_RECOMPOSE_MIN_MARK || mark > UCS_RECOMPOSE_MAX_MARK)
                return 0;

        /*
         * The key fields are u16: the narrowing from u32 here relies on the
         * range check above (assumes the UCS_RECOMPOSE_* bounds fit in
         * 16 bits -- they are defined in the table header, confirm there).
         */
        struct compare_key key = { .base = base, .mark = mark };

        /* The table is only ever read: keep the result pointer const */
        const struct ucs_recomposition *result =
                __inline_bsearch(&key, ucs_recomposition_table,
                                 ARRAY_SIZE(ucs_recomposition_table),
                                 sizeof(*ucs_recomposition_table),
                                 recomposition_cmp);

        return result ? result->recomposed : 0;
}

/*
 * The fallback table structures implement a 2-level lookup: the high byte
 * of the code point selects a page descriptor, then the low byte selects
 * an entry among those belonging to that page.
 */

/* First level: describes the slice of the entries array owned by one page */
struct ucs_page_desc {
        u8 page;        /* Page index (high byte of code points) */
        u8 count;       /* Number of entries in this page */
        u16 start;      /* Start index in entries array */
};

/*
 * Second level: maps an offset within a page to its fallback character.
 *
 * When @fallback holds UCS_PAGE_ENTRY_RANGE_MARKER, this entry opens a range
 * starting at @offset; the entry that immediately follows supplies the last
 * offset of the range and the fallback character shared by the whole range.
 */
struct ucs_page_entry {
        u8 offset;      /* Offset within page (0-255) */
        u8 fallback;    /* Fallback character or range start marker */
};

#include "ucs_fallback_table.h"

/*
 * Binary search comparator for the page descriptor array.
 * @key points to the u8 page index being looked up, @element to a
 * struct ucs_page_desc.
 *
 * Returns -1, 0 or 1 depending on how the page index compares with the
 * descriptor's page.
 */
static int ucs_page_desc_cmp(const void *key, const void *element)
{
        const struct ucs_page_desc *desc = element;
        const u8 *wanted = key;

        if (*wanted == desc->page)
                return 0;

        return *wanted < desc->page ? -1 : 1;
}

/*
 * Binary search comparator for the entries of one fallback page.
 * @key points to the u8 offset being looked up, @element to a
 * struct ucs_page_entry.
 *
 * A plain entry matches its own offset only. An entry flagged with
 * UCS_PAGE_ENTRY_RANGE_MARKER matches every offset from its own up to
 * the offset stored in the entry that follows it.
 *
 * Returns -1 when the offset is below the entry, 1 when it is above the
 * entry (or above the end of its range), 0 on a match.
 */
static int ucs_page_entry_cmp(const void *key, const void *element)
{
        const struct ucs_page_entry *entry = element;
        const u8 *offset = key;
        u8 upper;

        if (*offset < entry->offset)
                return -1;

        /* The next entry is only consulted when this one opens a range */
        upper = entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER ?
                entry[1].offset : entry->offset;

        return *offset > upper ? 1 : 0;
}

/**
 * ucs_get_fallback() - Get a substitution for the provided Unicode character
 * @cp: Unicode code point (UCS-4)
 *
 * Look up a simpler character that can stand in for @cp on a terminal whose
 * font lacks the corresponding glyph. The substitute may be a rough
 * approximation of the original character, but it remains far more useful
 * than a squared question mark.
 *
 * Only code points up to 0xffff have fallbacks. The lookup goes through the
 * page descriptor selected by the high byte, then through that page's
 * entries using the low byte.
 *
 * Return: Fallback Unicode code point, or 0 if none is available
 */
u32 ucs_get_fallback(u32 cp)
{
        const struct ucs_page_desc *page;
        const struct ucs_page_entry *entry;
        u8 page_idx, offset;

        if (!UCS_IS_BMP(cp))
                return 0;

        /*
         * Full-width forms 0xFF01 (!) through 0xFF5E (~) map one-to-one onto
         * printable ASCII 33 (!) through 126 (~). Computing the result here
         * keeps those 94 characters out of the table.
         */
        if (cp >= 0xFF01 && cp <= 0xFF5E)
                return '!' + (cp - 0xFF01);

        page_idx = cp >> 8;
        offset = cp & 0xff;

        /* First level: find the descriptor for this code point's page */
        page = __inline_bsearch(&page_idx, ucs_fallback_pages,
                                ARRAY_SIZE(ucs_fallback_pages),
                                sizeof(*ucs_fallback_pages),
                                ucs_page_desc_cmp);
        if (!page)
                return 0;

        /* Second level: search only the entries belonging to that page */
        entry = __inline_bsearch(&offset, ucs_fallback_entries + page->start,
                                 page->count, sizeof(*ucs_fallback_entries),
                                 ucs_page_entry_cmp);
        if (!entry)
                return 0;

        /* A range marker defers to the next entry for the actual fallback */
        return entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER ?
               entry[1].fallback : entry->fallback;
}