root/usr.bin/localedef/scanner.c
/*-
 * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2015 John Marino <draco@marino.st>
 *
 * This source code is derived from the illumos localedef command, and
 * provided under BSD-style license terms by Nexenta Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file contains the "scanner", which tokenizes the input files
 * for localedef for processing by the higher level grammar processor.
 */
#include <sys/cdefs.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <limits.h>
#include <string.h>
#include <wchar.h>
#include <sys/types.h>
#include <assert.h>
#include "localedef.h"
#include "parser.h"

/* Character that starts a comment; yylex() discards from it to end of line. */
int                     com_char = '#';
/* Escape character recognized by yylex(), get_symbol() and get_byte(). */
int                     esc_char = '\\';
/*
 * Not referenced by the scanner itself.
 * NOTE(review): presumably set from the charmap's <mb_cur_min> -- confirm.
 */
int                     mb_cur_min = 1;
/* Upper bound on the number of bytes get_wide() collects for one character. */
int                     mb_cur_max = 1;
/* Line of the most recently read character; printed in diagnostics. */
int                     lineno = 1;
/* Number of warnings issued through warn(). */
int                     warnings = 0;
/* Nonzero while reading from stdin instead of 'input'. */
int                     is_stdin = 1;
/* Stream opened by reset_scanner(); only used when is_stdin is zero. */
FILE                    *input;
/* Line that the next character read by scanc() belongs to. */
static int              nextline;
/* Name of the current input, printed in diagnostics. */
static const char       *filename = "<stdin>";
/* Nonzero while yylex() is between an opening and a closing double quote. */
static int              instring = 0;
/* Nonzero when the previous character read was esc_char. */
static int              escaped = 0;

/*
 * Token space ... grows on demand.
 *
 * token is the NUL-terminated buffer being filled by add_tok(), tokidx the
 * number of bytes stored in it so far and toksz its allocated size.  hadtok
 * is set by yylex() once something has been seen on the current line, so
 * that lines without any token do not produce T_NL.
 */
static char *token = NULL;
static int tokidx;
static int toksz = 0;
static int hadtok = 0;

/*
 * Wide string space ... grows on demand.
 *
 * Filled by add_wcs() and handed out by get_wcs(); wideidx is the number
 * of wide characters stored, widesz the allocated element count.
 */
static wchar_t *widestr = NULL;
static int wideidx = 0;
static int widesz = 0;

/*
 * The last keyword seen.  This is useful to trigger the special lexer rules
 * for "copy" and also collating symbols and elements.
 *
 * category is the top-level category currently open, or T_END when none is.
 */
int     last_kw = 0;
static int      category = T_END;

/*
 * Reserved words recognized by consume_token().  Each entry pairs the
 * spelling found in the input with the token id returned to the parser.
 * The table is terminated by an entry whose name is NULL.
 */
static struct token {
        int id;
        const char *name;
} keywords[] = {
        { T_COM_CHAR,           "comment_char" },
        { T_ESC_CHAR,           "escape_char" },
        { T_END,                "END" },
        { T_COPY,               "copy" },
        { T_MESSAGES,           "LC_MESSAGES" },
        { T_YESSTR,             "yesstr" },
        { T_YESEXPR,            "yesexpr" },
        { T_NOSTR,              "nostr" },
        { T_NOEXPR,             "noexpr" },
        { T_MONETARY,           "LC_MONETARY" },
        { T_INT_CURR_SYMBOL,    "int_curr_symbol" },
        { T_CURRENCY_SYMBOL,    "currency_symbol" },
        { T_MON_DECIMAL_POINT,  "mon_decimal_point" },
        { T_MON_THOUSANDS_SEP,  "mon_thousands_sep" },
        { T_POSITIVE_SIGN,      "positive_sign" },
        { T_NEGATIVE_SIGN,      "negative_sign" },
        { T_MON_GROUPING,       "mon_grouping" },
        { T_INT_FRAC_DIGITS,    "int_frac_digits" },
        { T_FRAC_DIGITS,        "frac_digits" },
        { T_P_CS_PRECEDES,      "p_cs_precedes" },
        { T_P_SEP_BY_SPACE,     "p_sep_by_space" },
        { T_N_CS_PRECEDES,      "n_cs_precedes" },
        { T_N_SEP_BY_SPACE,     "n_sep_by_space" },
        { T_P_SIGN_POSN,        "p_sign_posn" },
        { T_N_SIGN_POSN,        "n_sign_posn" },
        { T_INT_P_CS_PRECEDES,  "int_p_cs_precedes" },
        { T_INT_N_CS_PRECEDES,  "int_n_cs_precedes" },
        { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" },
        { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" },
        { T_INT_P_SIGN_POSN,    "int_p_sign_posn" },
        { T_INT_N_SIGN_POSN,    "int_n_sign_posn" },
        { T_COLLATE,            "LC_COLLATE" },
        { T_COLLATING_SYMBOL,   "collating-symbol" },
        { T_COLLATING_ELEMENT,  "collating-element" },
        { T_FROM,               "from" },
        { T_ORDER_START,        "order_start" },
        { T_ORDER_END,          "order_end" },
        { T_FORWARD,            "forward" },
        { T_BACKWARD,           "backward" },
        { T_POSITION,           "position" },
        { T_IGNORE,             "IGNORE" },
        { T_UNDEFINED,          "UNDEFINED" },
        { T_NUMERIC,            "LC_NUMERIC" },
        { T_DECIMAL_POINT,      "decimal_point" },
        { T_THOUSANDS_SEP,      "thousands_sep" },
        { T_GROUPING,           "grouping" },
        { T_TIME,               "LC_TIME" },
        { T_ABDAY,              "abday" },
        { T_DAY,                "day" },
        { T_ABMON,              "abmon" },
        { T_MON,                "mon" },
        { T_D_T_FMT,            "d_t_fmt" },
        { T_D_FMT,              "d_fmt" },
        { T_T_FMT,              "t_fmt" },
        { T_AM_PM,              "am_pm" },
        { T_T_FMT_AMPM,         "t_fmt_ampm" },
        { T_ERA,                "era" },
        { T_ERA_D_FMT,          "era_d_fmt" },
        { T_ERA_T_FMT,          "era_t_fmt" },
        { T_ERA_D_T_FMT,        "era_d_t_fmt" },
        { T_ALT_DIGITS,         "alt_digits" },
        { T_CTYPE,              "LC_CTYPE" },
        { T_ISUPPER,            "upper" },
        { T_ISLOWER,            "lower" },
        { T_ISALPHA,            "alpha" },
        { T_ISDIGIT,            "digit" },
        { T_ISPUNCT,            "punct" },
        { T_ISXDIGIT,           "xdigit" },
        { T_ISSPACE,            "space" },
        { T_ISPRINT,            "print" },
        { T_ISGRAPH,            "graph" },
        { T_ISBLANK,            "blank" },
        { T_ISCNTRL,            "cntrl" },
        /*
         * These entries are local additions, and not specified by
         * TOG.  Note that they are not guaranteed to be accurate for
         * all locales, and so applications should not depend on them.
         */
        { T_ISSPECIAL,          "special" },
        { T_ISENGLISH,          "english" },
        { T_ISPHONOGRAM,        "phonogram" },
        { T_ISIDEOGRAM,         "ideogram" },
        { T_ISNUMBER,           "number" },
        /*
         * We have to support this in the grammar, but it would be a
         * syntax error to define a character as one of these without
         * also defining it as an alpha or digit.  We ignore it in our
         * parsing.
         */
        { T_ISALNUM,            "alnum" },
        { T_TOUPPER,            "toupper" },
        { T_TOLOWER,            "tolower" },

        /*
         * These are keywords used in the charmap file.  Note that
         * Solaris originally used angle brackets to wrap some of them,
         * but we removed that to simplify our parser.  Both of them
         * also appear in categories[] below, i.e. they open a
         * top-level section just like the LC_* keywords do.
         */
        { T_CHARMAP,            "CHARMAP" },
        { T_WIDTH,              "WIDTH" },

        /* end-of-table marker */
        { -1, NULL },
};

/*
 * These special words are only used in a charmap file, enclosed in <>.
 * get_symbol() matches a completed symbolic name against this table only
 * while no category is open (category == T_END); a match is returned as
 * the keyword's token id rather than as a symbol.  The table is terminated
 * by an entry whose name is NULL.
 */
static struct token symwords[] = {
        { T_COM_CHAR,           "comment_char" },
        { T_ESC_CHAR,           "escape_char" },
        { T_CODE_SET,           "code_set_name" },
        { T_MB_CUR_MAX,         "mb_cur_max" },
        { T_MB_CUR_MIN,         "mb_cur_min" },
        /* end-of-table marker */
        { -1, NULL },
};

/*
 * Keyword ids that open a top-level category.  When consume_token()
 * recognizes one of these it records it in 'category'; the T_END keyword
 * resets 'category' again.  The list is terminated by 0.
 */
static int categories[] = {
        T_CHARMAP,
        T_CTYPE,
        T_COLLATE,
        T_MESSAGES,
        T_MONETARY,
        T_NUMERIC,
        T_TIME,
        T_WIDTH,
        0
};

/*
 * Point the scanner at a new input and reset the lexer state.
 *
 * fname is the path of the file to scan, or NULL to read from standard
 * input.  The pointer itself is stored for use in diagnostics (it is not
 * copied), so it must stay valid while the file is being scanned.
 *
 * A file opened by an earlier call is closed first, whichever input is
 * selected next, so that switching back to stdin does not leak it.
 * If the file cannot be opened the process exits with status 4.
 */
void
reset_scanner(const char *fname)
{
        /* release the previous file, if any, before choosing a new input */
        if (!is_stdin) {
                (void) fclose(input);
                input = NULL;
                is_stdin = 1;
        }

        if (fname == NULL) {
                filename = "<stdin>";
        } else {
                if ((input = fopen(fname, "r")) == NULL) {
                        perror("fopen");
                        exit(4);
                }
                is_stdin = 0;
                filename = fname;
        }

        /* back to the default comment/escape characters and a clean slate */
        com_char = '#';
        esc_char = '\\';
        instring = 0;
        escaped = 0;
        lineno = 1;
        nextline = 1;
        tokidx = 0;
        wideidx = 0;
}

/*
 * hex(x) yields the value (0..15) of the hexadecimal digit character x;
 * the caller is expected to have validated x with isxdigit() first.
 * isodigit(x) is nonzero when x is one of the octal digits '0'..'7'.
 *
 * Every use of the argument is parenthesized so that compound expressions
 * (for example a conditional expression) expand correctly.  Both macros
 * still evaluate x more than once, so x must not have side effects.
 */
#define hex(x)  \
        (isdigit(x) ? ((x) - '0') : \
            ((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define isodigit(x)     (((x) >= '0') && ((x) <= '7'))

/*
 * Read one character from the current input (stdin or the opened file)
 * and keep the line bookkeeping up to date.  Returns the character, or
 * EOF as reported by getc().
 */
static int
scanc(void)
{
        FILE    *src = is_stdin ? stdin : input;
        int     ch = getc(src);

        /* this character sits on the line we were about to read from */
        lineno = nextline;
        /* anything read after a newline belongs to the following line */
        if (ch == '\n')
                nextline++;

        return (ch);
}

/*
 * Push a character obtained from scanc() back onto the current input so
 * that the next scanc() returns it again.
 *
 * Pushing back EOF is a no-op: ungetc() always rejects EOF, and nothing
 * needs to be restored for the next read to report end of input again.
 * Without this, input that ends directly after a numeric escape would be
 * reported as "ungetc failed".
 */
static void
unscanc(int c)
{
        if (c == EOF)
                return;

        /* undo the line advance scanc() made when it read this newline */
        if (c == '\n')
                nextline--;

        if (ungetc(c, is_stdin ? stdin : input) == EOF)
                yyerror("ungetc failed");
}

/*
 * Read exactly two hexadecimal digits from the input and return the byte
 * value they encode.  A non-hex character is reported through yyerror()
 * and yields 0.
 */
static int
scan_hex_byte(void)
{
        int     digit[2];
        int     i;

        for (i = 0; i < 2; i++) {
                digit[i] = scanc();
                if (!isxdigit(digit[i])) {
                        yyerror("malformed hex digit");
                        return (0);
                }
        }

        /* first digit is the high nibble, second the low nibble */
        return ((hex(digit[0]) << 4) | hex(digit[1]));
}

/*
 * Read a decimal byte constant of two or three digits from the input and
 * return its value.
 *
 * Two digits are mandatory; a third is consumed only if it is a digit,
 * otherwise the character is pushed back.  Malformed input, or a value
 * that does not fit in a byte (three digits can encode up to 999), is
 * reported through yyerror() and yields 0.
 */
static int
scan_dec_byte(void)
{
        int     c;
        int     val;

        c = scanc();
        if (!isdigit(c)) {
                yyerror("malformed decimal digit");
                return (0);
        }
        val = c - '0';

        c = scanc();
        if (!isdigit(c)) {
                yyerror("malformed decimal digit");
                return (0);
        }
        val = val * 10 + (c - '0');

        /* the third digit is optional */
        c = scanc();
        if (isdigit(c)) {
                val = val * 10 + (c - '0');
        } else {
                unscanc(c);
        }

        /* the result is stored in a single byte by the caller */
        if (val > UCHAR_MAX) {
                yyerror("decimal byte value out of range");
                return (0);
        }
        return (val);
}

/*
 * Read an octal byte constant of two or three digits from the input and
 * return its value.
 *
 * Two digits are mandatory; a third is consumed only if it is an octal
 * digit, otherwise the character is pushed back.  Malformed input, or a
 * value that does not fit in a byte (three digits can encode up to 0777),
 * is reported through yyerror() and yields 0.
 */
static int
scan_oct_byte(void)
{
        int     c;
        int     val;

        c = scanc();
        if (!isodigit(c)) {
                yyerror("malformed octal digit");
                return (0);
        }
        val = c - '0';

        c = scanc();
        if (!isodigit(c)) {
                yyerror("malformed octal digit");
                return (0);
        }
        val = val * 8 + (c - '0');

        /* the third digit is optional */
        c = scanc();
        if (isodigit(c)) {
                val = val * 8 + (c - '0');
        } else {
                unscanc(c);
        }

        /* the result is stored in a single byte by the caller */
        if (val > UCHAR_MAX) {
                yyerror("octal byte value out of range");
                return (0);
        }
        return (val);
}

/*
 * Append one byte to the token being assembled, keeping the buffer
 * NUL-terminated.  The buffer grows in steps of 64 bytes; if it cannot be
 * grown the error is reported through yyerror() and the token state is
 * cleared.
 */
void
add_tok(int c)
{
        /* need room for the new byte plus the terminator */
        if (tokidx + 1 >= toksz) {
                toksz += 64;
                token = realloc(token, toksz);
                if (token == NULL) {
                        yyerror("out of memory");
                        tokidx = 0;
                        toksz = 0;
                        return;
                }
        }

        token[tokidx] = (char)c;
        tokidx++;
        token[tokidx] = '\0';
}
/*
 * Append one wide character to the wide string being assembled, keeping
 * it terminated.  The buffer grows in steps of 64 elements; if it cannot
 * be grown the error is reported through yyerror() and the wide string
 * state is cleared.
 */
void
add_wcs(wchar_t c)
{
        /* need room for the new element plus the terminator */
        if (wideidx + 1 >= widesz) {
                widesz += 64;
                widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
                if (widestr == NULL) {
                        yyerror("out of memory");
                        wideidx = 0;
                        widesz = 0;
                        return;
                }
        }

        widestr[wideidx] = c;
        wideidx++;
        widestr[wideidx] = 0;
}

/*
 * Hand out the wide string collected by add_wcs() and start over with an
 * empty one.  The scanner drops its own reference to the returned buffer.
 * If nothing was collected, a freshly allocated empty string is returned
 * instead; failure to allocate it is reported through yyerror().
 */
wchar_t *
get_wcs(void)
{
        wchar_t *result = widestr;

        /* the next add_wcs() call begins a brand new buffer */
        widestr = NULL;
        widesz = 0;
        wideidx = 0;

        if (result != NULL)
                return (result);

        result = wcsdup(L"");
        if (result == NULL)
                yyerror("out of memory");
        return (result);
}

/*
 * Read one escaped numeric byte constant (decimal, hexadecimal or octal,
 * introduced by the escape character) and return its value.  If the input
 * does not start with such a constant, whatever was read is pushed back
 * and EOF is returned.
 */
static int
get_byte(void)
{
        int     c;

        c = scanc();
        if (c != esc_char) {
                unscanc(c);
                return (EOF);
        }

        c = scanc();
        if (c == 'd' || c == 'D')
                return (scan_dec_byte());
        if (c == 'x' || c == 'X')
                return (scan_hex_byte());
        if (c >= '0' && c <= '7') {
                /* the digit is part of the number; let the reader see it */
                unscanc(c);
                return (scan_oct_byte());
        }

        /* not a numeric escape: restore both characters */
        unscanc(c);
        unscanc(esc_char);
        return (EOF);
}

/*
 * Translate the character following an escape character into the control
 * character it names (n, r, t, f, v, b, a).  Any other character stands
 * for itself and is returned unchanged.
 */
int
get_escaped(int c)
{
        static const struct {
                int     name;
                int     value;
        } escapes[] = {
                { 'n', '\n' },
                { 'r', '\r' },
                { 't', '\t' },
                { 'f', '\f' },
                { 'v', '\v' },
                { 'b', '\b' },
                { 'a', '\a' },
        };
        unsigned int    i;

        for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++) {
                if (escapes[i].name == c)
                        return (escapes[i].value);
        }
        return (c);
}

/*
 * Assemble one wide character from a run of escaped byte constants.
 *
 * Bytes are fetched with get_byte() and appended to a small buffer until
 * to_wide() accepts the sequence.  On success the character is stored in
 * yylval.wc and T_CHAR is returned.  T_NULL is returned, after reporting
 * through yyerror(), when mb_cur_max does not fit the buffer, when the
 * bytes never decode, or when (outside CHARMAP and WIDTH) check_charmap()
 * rejects the character.
 */
int
get_wide(void)
{
        static char mbs[MB_LEN_MAX + 1] = "";
        static int mbi = 0;
        int byte;
        wchar_t wc;

        if (mb_cur_max >= (int)sizeof (mbs)) {
                yyerror("max multibyte character size too big");
                mbi = 0;
                return (T_NULL);
        }

        do {
                /*
                 * Either the longest permitted sequence has been tried
                 * or no further byte constant follows: nothing decoded.
                 */
                if ((mbi == mb_cur_max) || ((byte = get_byte()) == EOF)) {
                        mbi = 0;
                        yyerror("not a valid character encoding");
                        return (T_NULL);
                }
                mbs[mbi] = byte;
                mbi++;
                mbs[mbi] = 0;
        } while (to_wide(&wc, mbs) < 0);

        mbi = 0;

        if ((category != T_CHARMAP) && (category != T_WIDTH) &&
            (check_charmap(wc) < 0)) {
                yyerror("no symbolic name for character");
                return (T_NULL);
        }

        yylval.wc = wc;
        return (T_CHAR);
}

/*
 * Scan a symbolic name up to its closing '>' and classify it.  The
 * opening '<' has already been consumed by the caller.
 *
 * Returns:
 *   a symwords[] keyword id  when no category is open and the name matches
 *                            (last_kw is updated as well);
 *   T_CHAR                   when the name is in the charmap (not looked up
 *                            while inside CHARMAP); yylval.wc is set;
 *   T_COLLSYM / T_COLLELEM   when the name is a known collating symbol or
 *                            element; yylval.collsym / collelem is set;
 *   T_SYMBOL                 for any other name; yylval.token receives the
 *                            name and the scanner gives up its reference;
 *   T_NULL                   for an empty name "<>";
 *   EOF                      when the input ends before '>'.
 * Errors are reported through yyerror().
 */
int
get_symbol(void)
{
        int     c;

        while ((c = scanc()) != EOF) {
                if (escaped) {
                        escaped = 0;
                        /* an escaped newline just continues the line */
                        if (c != '\n')
                                add_tok(get_escaped(c));
                        continue;
                }
                if (c == esc_char) {
                        escaped = 1;
                        continue;
                }
                if (c == '\n') {        /* well that's strange! */
                        yyerror("unterminated symbolic name");
                        continue;
                }
                if (c != '>') {
                        add_tok(c);
                        continue;
                }

                /*
                 * End of symbol.  The token buffer is reused between
                 * tokens, so a non-NULL buffer alone does not prove that
                 * a name was collected: with tokidx == 0 it still holds
                 * the bytes of the previous token.
                 */
                if (token == NULL || tokidx == 0) {
                        yyerror("missing symbolic name");
                        return (T_NULL);
                }

                /*
                 * This restarts the token from the beginning the next
                 * time we scan a character.  (This token is complete.)
                 */
                tokidx = 0;

                /*
                 * A few symbols are handled as keywords outside
                 * of the normal categories.
                 */
                if (category == T_END) {
                        int i;
                        for (i = 0; symwords[i].name != NULL; i++) {
                                if (strcmp(token, symwords[i].name) == 0) {
                                        last_kw = symwords[i].id;
                                        return (last_kw);
                                }
                        }
                }
                /*
                 * Contextual rule: Only literal characters are
                 * permitted in CHARMAP.  Anywhere else the symbolic
                 * forms are fine.
                 */
                if ((category != T_CHARMAP) &&
                    (lookup_charmap(token, &yylval.wc)) != -1) {
                        return (T_CHAR);
                }
                if ((yylval.collsym = lookup_collsym(token)) != NULL) {
                        return (T_COLLSYM);
                }
                if ((yylval.collelem = lookup_collelem(token)) != NULL) {
                        return (T_COLLELEM);
                }

                /*
                 * It's an undefined symbol.  Hand the buffer itself to
                 * the parser instead of duplicating it and then dropping
                 * the original, which leaked the buffer and left the
                 * copy's allocation unchecked.
                 */
                yylval.token = token;
                token = NULL;
                toksz = 0;
                return (T_SYMBOL);
        }

        yyerror("unterminated symbolic name");
        return (EOF);
}

/*
 * Report the top-level category the scanner is currently inside
 * (T_END when none is open).
 */
int
get_category(void)
{
        int     current = category;

        return (current);
}

/*
 * Classify the token collected so far and reset the token index so the
 * next character starts a new token.
 *
 * Returns:
 *   T_NULL      when no token buffer exists;
 *   T_ELLIPSIS  for "..." (deliberately leaves last_kw untouched);
 *   a keyword id for an entry of keywords[]; last_kw is updated, and
 *               'category' is updated when the keyword opens or ends a
 *               top-level category;
 *   T_NUMBER    for a decimal constant, stored in yylval.num;
 *   T_CHAR      for a single character, stored in yylval.wc;
 *   T_NAME      for anything else; yylval.token receives the text and the
 *               scanner gives up its reference to the buffer.
 */
static int
consume_token(void)
{
        int     len = tokidx;
        int     i;

        tokidx = 0;
        if (token == NULL)
                return (T_NULL);

        /*
         * this one is special, because we don't want it to alter the
         * last_kw field.
         */
        if (strcmp(token, "...") == 0) {
                return (T_ELLIPSIS);
        }

        /* search for reserved words first */
        for (i = 0; keywords[i].name; i++) {
                int j;
                if (strcmp(keywords[i].name, token) != 0) {
                        continue;
                }

                last_kw = keywords[i].id;

                /* clear the top level category if we're done with it */
                if (last_kw == T_END) {
                        category = T_END;
                }

                /* set the top level category if we're changing */
                for (j = 0; categories[j]; j++) {
                        if (categories[j] == last_kw) {
                                category = last_kw;
                                break;
                        }
                }

                return (keywords[i].id);
        }

        /*
         * maybe its a numeric constant?  The token bytes are plain char,
         * which may be negative; <ctype.h> functions require a value
         * representable as unsigned char.
         */
        if (isdigit((unsigned char)token[0]) ||
            (token[0] == '-' && isdigit((unsigned char)token[1]))) {
                char *eptr;
                yylval.num = strtol(token, &eptr, 10);
                if (*eptr != 0)
                        yyerror("malformed number");
                return (T_NUMBER);
        }

        /* A single lone character is treated as a character literal. */
        if (len == 1) {
                yylval.wc = token[0];
                return (T_CHAR);
        }

        /*
         * Anything else is treated as a symbolic name.  Hand the buffer
         * itself to the parser instead of duplicating it and then dropping
         * the original, which leaked the buffer and left the copy's
         * allocation unchecked.
         */
        yylval.token = token;
        token = NULL;
        toksz = 0;
        return (T_NAME);
}

/*
 * Discard input up to and including the next newline.  Reaching end of
 * file before a newline is reported through errf().
 */
void
scan_to_eol(void)
{
        int     c;

        for (;;) {
                c = scanc();
                if (c == '\n')
                        break;
                if (c == EOF) {
                        /* end of file without newline! */
                        errf("missing newline");
                        return;
                }
        }
        assert(c == '\n');
}

/*
 * Return the next token to the parser, or EOF when the input is exhausted.
 *
 * Characters are read one at a time with scanc().  Inside a quoted string
 * (instring set) every character is returned individually as T_CHAR, with
 * symbolic names and numeric escapes resolved through get_symbol() and
 * get_wide().  Outside a string, characters are accumulated with add_tok()
 * until a delimiter is seen, at which point the delimiter is pushed back
 * and the accumulated text is classified by consume_token().
 *
 * hadtok tracks whether anything was seen on the current line, so that
 * blank and comment-only lines do not produce a T_NL token.
 */
int
yylex(void)
{
        int             c;

        while ((c = scanc()) != EOF) {

                /* special handling for quoted string */
                if (instring) {
                        if (escaped) {
                                escaped = 0;

                                /* if newline, just eat and forget it */
                                if (c == '\n')
                                        continue;

                                /*
                                 * Numeric escape: restore the escape
                                 * sequence and let get_wide() decode it.
                                 * NOTE(review): get_byte() also accepts
                                 * 'D', which is not listed here --
                                 * confirm whether that is intentional.
                                 */
                                if (strchr("xXd01234567", c)) {
                                        unscanc(c);
                                        unscanc(esc_char);
                                        return (get_wide());
                                }
                                yylval.wc = get_escaped(c);
                                return (T_CHAR);
                        }
                        if (c == esc_char) {
                                escaped = 1;
                                continue;
                        }
                        switch (c) {
                        case '<':
                                return (get_symbol());
                        case '>':
                                /* oops! should generate syntax error  */
                                return (T_GT);
                        case '"':
                                /* closing quote ends string mode */
                                instring = 0;
                                return (T_QUOTE);
                        default:
                                yylval.wc = c;
                                return (T_CHAR);
                        }
                }

                /* escaped characters first */
                if (escaped) {
                        escaped = 0;
                        if (c == '\n') {
                                /* eat the newline */
                                continue;
                        }
                        hadtok = 1;
                        if (tokidx) {
                                /* an escape mid-token is nonsense */
                                return (T_NULL);
                        }

                        /* numeric escapes are treated as wide characters */
                        if (strchr("xXd01234567", c)) {
                                unscanc(c);
                                unscanc(esc_char);
                                return (get_wide());
                        }

                        add_tok(get_escaped(c));
                        continue;
                }

                /* if it is the escape character itself, note it */
                if (c == esc_char) {
                        escaped = 1;
                        continue;
                }

                /* remove from the comment char to end of line */
                if (c == com_char) {
                        while (c != '\n') {
                                if ((c = scanc()) == EOF) {
                                        /* end of file without newline! */
                                        return (EOF);
                                }
                        }
                        assert(c == '\n');
                        if (!hadtok) {
                                /*
                                 * If there were no tokens on this line,
                                 * then just pretend it didn't exist at all.
                                 */
                                continue;
                        }
                        hadtok = 0;
                        return (T_NL);
                }

                if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
                        /*
                         * These are all token delimiters.  If there
                         * is a token already in progress, we need to
                         * process it.  The delimiter is pushed back so
                         * it is handled on the next call.
                         */
                        unscanc(c);
                        return (consume_token());
                }

                switch (c) {
                case '\n':
                        if (!hadtok) {
                                /*
                                 * If the line was completely devoid of tokens,
                                 * then just ignore it.
                                 */
                                continue;
                        }
                        /* we're starting a new line, reset the token state */
                        hadtok = 0;
                        return (T_NL);
                case ',':
                        hadtok = 1;
                        return (T_COMMA);
                case ';':
                        hadtok = 1;
                        return (T_SEMI);
                case '(':
                        hadtok = 1;
                        return (T_LPAREN);
                case ')':
                        hadtok = 1;
                        return (T_RPAREN);
                case '>':
                        hadtok = 1;
                        return (T_GT);
                case '<':
                        /* symbol start! */
                        hadtok = 1;
                        return (get_symbol());
                case ' ':
                case '\t':
                        /* whitespace, just ignore it */
                        continue;
                case '"':
                        /* opening quote switches to string mode */
                        hadtok = 1;
                        instring = 1;
                        return (T_QUOTE);
                default:
                        /* ordinary character: extend the current token */
                        hadtok = 1;
                        add_tok(c);
                        continue;
                }
        }
        return (EOF);
}

void
yyerror(const char *msg)
{
        (void) fprintf(stderr, "%s: %d: error: %s\n",
            filename, lineno, msg);
        exit(4);
}

void
errf(const char *fmt, ...)
{
        char    *msg;

        va_list va;
        va_start(va, fmt);
        (void) vasprintf(&msg, fmt, va);
        va_end(va);

        (void) fprintf(stderr, "%s: %d: error: %s\n",
            filename, lineno, msg);
        free(msg);
        exit(4);
}

/*
 * Report a printf-style formatted warning at the current input position
 * on stderr, in the form "file: line: warning: message", and count it in
 * 'warnings'.  Unless warnok is set, the process then exits with status 4.
 *
 * The message is formatted straight onto stderr.  No intermediate buffer
 * is allocated, so reporting cannot itself fail for lack of memory and
 * never touches an unset message pointer.
 */
void
warn(const char *fmt, ...)
{
        va_list va;

        (void) fprintf(stderr, "%s: %d: warning: ", filename, lineno);
        va_start(va, fmt);
        (void) vfprintf(stderr, fmt, va);
        va_end(va);
        (void) fputc('\n', stderr);

        warnings++;
        if (!warnok)
                exit(4);
}