root/usr/src/cmd/iconv/charmap.c
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * CHARMAP file handling for iconv.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <unistd.h>
#include <alloca.h>
#include <sys/avl.h>
#include <stddef.h>
#include <unistd.h>
#include "charmap.h"
#include "parser.tab.h"
#include <assert.h>

enum cmap_pass cmap_pass;
static avl_tree_t       cmap_sym;
static avl_tree_t       cmap_mbs;

typedef struct charmap {
        const char *cm_name;
        struct charmap *cm_alias_of;
        avl_node_t cm_avl_sym;
        avl_node_t cm_avl_mbs;
        int cm_warned;
        int cm_frmbs_len;
        int cm_tombs_len;
        char cm_frmbs[MB_LEN_MAX + 1]; /* input */
        char cm_tombs[MB_LEN_MAX + 1]; /* output */
} charmap_t;

static void add_charmap_impl_fr(char *sym, char *mbs, int mbs_len, int nodups);
static void add_charmap_impl_to(char *sym, char *mbs, int mbs_len, int nodups);

/*
 * Array of POSIX specific portable characters.
 */
static const struct {
        char    *name;
        int     ch;
} portable_chars[] = {
        { "NUL",                 '\0' },
        { "alert",              '\a' },
        { "backspace",          '\b' },
        { "tab",                '\t' },
        { "carriage-return",    '\r' },
        { "newline",            '\n' },
        { "vertical-tab",       '\v' },
        { "form-feed",          '\f' },
        { "space",              ' ' },
        { "exclamation-mark",   '!' },
        { "quotation-mark",     '"' },
        { "number-sign",        '#' },
        { "dollar-sign",        '$' },
        { "percent-sign",       '%' },
        { "ampersand",          '&' },
        { "apostrophe",         '\'' },
        { "left-parenthesis",   '(' },
        { "right-parenthesis",  '(' },
        { "asterisk",           '*' },
        { "plus-sign",          '+' },
        { "comma",               ','},
        { "hyphen-minus",       '-' },
        { "hyphen",             '-' },
        { "full-stop",          '.' },
        { "period",             '.' },
        { "slash",              '/' },
        { "solidus",            '/' },
        { "zero",               '0' },
        { "one",                '1' },
        { "two",                '2' },
        { "three",              '3' },
        { "four",               '4' },
        { "five",               '5' },
        { "six",                '6' },
        { "seven",              '7' },
        { "eight",              '8' },
        { "nine",               '9' },
        { "colon",              ':' },
        { "semicolon",          ';' },
        { "less-than-sign",     '<' },
        { "equals-sign",        '=' },
        { "greater-than-sign",  '>' },
        { "question-mark",      '?' },
        { "commercial-at",      '@' },
        { "left-square-bracket", '[' },
        { "backslash",          '\\' },
        { "reverse-solidus",    '\\' },
        { "right-square-bracket", ']' },
        { "circumflex",         '^' },
        { "circumflex-accent",  '^' },
        { "low-line",           '_' },
        { "underscore",         '_' },
        { "grave-accent",       '`' },
        { "left-brace",         '{' },
        { "left-curly-bracket", '{' },
        { "vertical-line",      '|' },
        { "right-brace",        '}' },
        { "right-curly-bracket", '}' },
        { "tilde",              '~' },
        { "A", 'A' },
        { "B", 'B' },
        { "C", 'C' },
        { "D", 'D' },
        { "E", 'E' },
        { "F", 'F' },
        { "G", 'G' },
        { "H", 'H' },
        { "I", 'I' },
        { "J", 'J' },
        { "K", 'K' },
        { "L", 'L' },
        { "M", 'M' },
        { "N", 'N' },
        { "O", 'O' },
        { "P", 'P' },
        { "Q", 'Q' },
        { "R", 'R' },
        { "S", 'S' },
        { "T", 'T' },
        { "U", 'U' },
        { "V", 'V' },
        { "W", 'W' },
        { "X", 'X' },
        { "Y", 'Y' },
        { "Z", 'Z' },
        { "a", 'a' },
        { "b", 'b' },
        { "c", 'c' },
        { "d", 'd' },
        { "e", 'e' },
        { "f", 'f' },
        { "g", 'g' },
        { "h", 'h' },
        { "i", 'i' },
        { "j", 'j' },
        { "k", 'k' },
        { "l", 'l' },
        { "m", 'm' },
        { "n", 'n' },
        { "o", 'o' },
        { "p", 'p' },
        { "q", 'q' },
        { "r", 'r' },
        { "s", 's' },
        { "t", 't' },
        { "u", 'u' },
        { "v", 'v' },
        { "w", 'w' },
        { "x", 'x' },
        { "y", 'y' },
        { "z", 'z' },
        { NULL, 0 }
};

static int
cmap_compare_sym(const void *n1, const void *n2)
{
        const charmap_t *c1 = n1;
        const charmap_t *c2 = n2;
        int rv;

        rv = strcmp(c1->cm_name, c2->cm_name);
        return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
}

/*
 * In order for partial match searches to work,
 * we need these sorted by mbs contents.
 */
static int
cmap_compare_mbs(const void *n1, const void *n2)
{
        const charmap_t *c1 = n1;
        const charmap_t *c2 = n2;
        int len, rv;

        len = c1->cm_frmbs_len;
        if (len < c2->cm_frmbs_len)
                len = c2->cm_frmbs_len;
        rv = memcmp(c1->cm_frmbs, c2->cm_frmbs, len);
        if (rv < 0)
                return (-1);
        if (rv > 0)
                return (1);
        /* they match through length */
        if (c1->cm_frmbs_len < c2->cm_frmbs_len)
                return (-1);
        if (c2->cm_frmbs_len < c1->cm_frmbs_len)
                return (1);
        return (0);
}

void
charmap_init(char *to_map, char *from_map)
{
        avl_create(&cmap_sym, cmap_compare_sym, sizeof (charmap_t),
            offsetof(charmap_t, cm_avl_sym));

        avl_create(&cmap_mbs, cmap_compare_mbs, sizeof (charmap_t),
            offsetof(charmap_t, cm_avl_mbs));

        cmap_pass = CMAP_PASS_FROM;
        reset_scanner(from_map);
        (void) yyparse();
        add_charmap_posix();

        cmap_pass = CMAP_PASS_TO;
        reset_scanner(to_map);
        (void) yyparse();
}

void
charmap_dump()
{
        charmap_t *cm;
        int i;

        cm = avl_first(&cmap_mbs);
        while (cm != NULL) {
                (void) printf("name=\"%s\"\n", cm->cm_name);

                (void) printf("\timbs=\"");
                for (i = 0; i < cm->cm_frmbs_len; i++)
                        (void) printf("\\x%02x", cm->cm_frmbs[i] & 0xFF);
                (void) printf("\"\n");

                (void) printf("\tombs=\"");
                for (i = 0; i < cm->cm_tombs_len; i++)
                        (void) printf("\\x%02x", cm->cm_tombs[i] & 0xFF);
                (void) printf("\"\n");

                cm = AVL_NEXT(&cmap_mbs, cm);
        }
}

/*
 * We parse two charmap files:  First the "from" map, where we build
 * cmap_mbs and cmap_sym which we'll later use to translate the input
 * stream (mbs encodings) to symbols.  Second, we parse the "to" map,
 * where we fill in the tombs members of entries in cmap_sym, (which
 * must alread exist) used later to write the output encoding.
 */
static void
add_charmap_impl(char *sym, char *mbs, int mbs_len, int nodups)
{

        /*
         * While parsing both the "from" and "to" cmaps,
         * require both the symbol and encoding.
         */
        if (sym == NULL || mbs == NULL) {
                errf(_("invalid charmap entry"));
                return;
        }

        switch (cmap_pass) {
        case CMAP_PASS_FROM:
                add_charmap_impl_fr(sym, mbs, mbs_len, nodups);
                break;
        case CMAP_PASS_TO:
                add_charmap_impl_to(sym, mbs, mbs_len, nodups);
                break;
        default:
                abort();
                break;
        }
}

static void
add_charmap_impl_fr(char *sym, char *mbs, int mbs_len, int nodups)
{
        charmap_t       *m, *n, *s;
        avl_index_t     where_sym, where_mbs;

        if ((n = calloc(1, sizeof (*n))) == NULL) {
                errf(_("out of memory"));
                return;
        }
        n->cm_name = sym;

        assert(0 < mbs_len && mbs_len <= MB_LEN_MAX);
        (void) memcpy(n->cm_frmbs, mbs, mbs_len);
        n->cm_frmbs_len = mbs_len;

        m = avl_find(&cmap_mbs, n, &where_mbs);
        s = avl_find(&cmap_sym, n, &where_sym);

        /*
         * If we found the symbol, this is a dup.
         */
        if (s != NULL) {
                if (nodups) {
                        warn(_("%s: duplicate character symbol"), sym);
                }
                free(n);
                return;
        }

        /*
         * If we found the mbs, the new one is an alias,
         * which we'll add _only_ to the symbol AVL.
         */
        if (m != NULL) {
                /* The new one is an alias of the original. */
                n->cm_alias_of = m;
                avl_insert(&cmap_sym, n, where_sym);
                return;
        }

        avl_insert(&cmap_sym, n, where_sym);
        avl_insert(&cmap_mbs, n, where_mbs);
}

static void
add_charmap_impl_to(char *sym, char *mbs, int mbs_len, int nodups)
{
        charmap_t       srch = {0};
        charmap_t       *m;

        assert(0 < mbs_len && mbs_len <= MB_LEN_MAX);

        srch.cm_name = sym;

        m = avl_find(&cmap_sym, &srch, NULL);
        if (m == NULL) {
                if (sflag == 0)
                        warn(_("%s: symbol not found"), sym);
                return;
        }
        if (m->cm_alias_of != NULL) {
                m = m->cm_alias_of;

                /* don't warn for dups with aliases */
                if (m->cm_tombs_len != 0)
                        return;
        }

        if (m->cm_tombs_len != 0) {
                if (nodups) {
                        warn(_("%s: duplicate encoding for"), sym);
                }
                return;
        }

        (void) memcpy(m->cm_tombs, mbs, mbs_len);
        m->cm_tombs_len = mbs_len;
}

void
add_charmap(char *sym, char *mbs)
{
        /* mbs[0] is the length */
        int mbs_len = *mbs++;
        assert(0 < mbs_len && mbs_len <= MB_LEN_MAX);
        add_charmap_impl(sym, mbs, mbs_len, 1);
}


/*
 * This is called by the parser with start/end symbol strings (ssym, esym),
 * which are allocated in the scanner (T_SYMBOL) and free'd here.
 */
void
add_charmap_range(char *ssym, char *esym, char *mbs)
{
        int     ls, le;
        int     si;
        int     sn, en;
        int     i;
        int     mbs_len;
        char    tmbs[MB_LEN_MAX+1];
        char    *mb_last;

        static const char *digits = "0123456789";

        /* mbs[0] is the length */
        mbs_len = *mbs++;
        assert(0 < mbs_len && mbs_len <= MB_LEN_MAX);
        (void) memcpy(tmbs, mbs, mbs_len);
        mb_last = tmbs + mbs_len - 1;

        ls = strlen(ssym);
        le = strlen(esym);

        if (((si = strcspn(ssym, digits)) == 0) || (si == ls) ||
            (strncmp(ssym, esym, si) != 0) ||
            (strspn(ssym + si, digits) != (ls - si)) ||
            (strspn(esym + si, digits) != (le - si)) ||
            ((sn = atoi(ssym + si)) > ((en = atoi(esym + si))))) {
                errf(_("malformed charmap range"));
                return;
        }

        ssym[si] = 0;
        for (i = sn; i <= en; i++) {
                char *nn;
                (void) asprintf(&nn, "%s%0*u", ssym, ls - si, i);
                if (nn == NULL) {
                        errf(_("out of memory"));
                        return;
                }

                add_charmap_impl(nn, tmbs, mbs_len, 1);
                (*mb_last)++;
        }
        free(ssym);
        free(esym);
}

void
add_charmap_char(char *name, int c)
{
        char mbs[MB_LEN_MAX+1];

        mbs[0] = c;
        mbs[1] = '\0';
        add_charmap_impl(name, mbs, 1, 0);
}

/*
 * POSIX insists that certain entries be present, even when not in the
 * orginal charmap file.
 */
void
add_charmap_posix(void)
{
        int     i;

        for (i = 0; portable_chars[i].name; i++) {
                add_charmap_char(portable_chars[i].name, portable_chars[i].ch);
        }
}

/*
 * This is called with a buffer of (typically) MB_LEN_MAX bytes,
 * which is potentially a multi-byte symbol, but often contains
 * extra bytes. Find and return the longest match in the charmap.
 */
static charmap_t *
find_mbs(const char *mbs, int len)
{
        charmap_t srch = {0};
        charmap_t *cm = NULL;

        while (len > 0) {
                (void) memcpy(srch.cm_frmbs, mbs, len);
                srch.cm_frmbs_len = len;
                cm = avl_find(&cmap_mbs, &srch, NULL);
                if (cm != NULL)
                        break;
                len--;
        }

        return (cm);
}

/*
 * Return true if this sequence matches the initial part
 * of any sequence known in this charmap.
 */
static boolean_t
find_mbs_partial(const char *mbs, int len)
{
        charmap_t srch = {0};
        charmap_t *cm;
        avl_index_t where;

        (void) memcpy(srch.cm_frmbs, mbs, len);
        srch.cm_frmbs_len = len;
        cm = avl_find(&cmap_mbs, &srch, &where);
        if (cm != NULL) {
                /* full match - not expected, but OK */
                return (B_TRUE);
        }
        cm = avl_nearest(&cmap_mbs, where, AVL_AFTER);
        if (cm != NULL && 0 == memcmp(cm->cm_frmbs, mbs, len))
                return (B_TRUE);

        return (B_FALSE);
}

/*
 * Do like iconv(3), but with charmaps.
 */
size_t
cm_iconv(const char **iptr, size_t *ileft, char **optr, size_t *oleft)
{
        charmap_t *cm;
        int mbs_len;

        /* Ignore state reset requests. */
        if (iptr == NULL || *iptr == NULL)
                return (0);

        if (*oleft < MB_LEN_MAX) {
                errno = E2BIG;
                return ((size_t)-1);
        }

        while (*ileft > 0 && *oleft >= MB_LEN_MAX) {
                mbs_len = MB_LEN_MAX;
                if (mbs_len > *ileft)
                        mbs_len = *ileft;
                cm = find_mbs(*iptr, mbs_len);
                if (cm == NULL) {
                        if (mbs_len < MB_LEN_MAX &&
                            find_mbs_partial(*iptr, mbs_len)) {
                                /* incomplete sequence */
                                errno = EINVAL;
                        } else {
                                errno = EILSEQ;
                        }
                        return ((size_t)-1);
                }
                assert(cm->cm_frmbs_len > 0);
                if (cm->cm_tombs_len == 0) {
                        if (sflag == 0 && cm->cm_warned == 0) {
                                cm->cm_warned = 1;
                                warn(_("To-map does not encode <%s>\n"),
                                    cm->cm_name);
                        }
                        if (cflag == 0) {
                                errno = EILSEQ;
                                return ((size_t)-1);
                        }
                        /* just skip this input seq. */
                        *iptr  += cm->cm_frmbs_len;
                        *ileft -= cm->cm_frmbs_len;
                        continue;
                }

                *iptr  += cm->cm_frmbs_len;
                *ileft -= cm->cm_frmbs_len;
                (void) memcpy(*optr, cm->cm_tombs, cm->cm_tombs_len);
                *optr  += cm->cm_tombs_len;
                *oleft -= cm->cm_tombs_len;
        }

        return (0);
}