root/usr/src/tools/ndrgen/ndr_lex.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
 */

#include <errno.h>
#include <stdarg.h>
#include "ndrgen.h"
#include "y.tab.h"

/*
 * C-like lexical analysis.
 *
 * 1. Define a "struct node"
 * 2. Define a "struct symbol" that encapsulates a struct node.
 * 3. Define a "struct integer" that encapsulates a struct node.
 * 4. Set the YACC stack type in the grammar:
 *              %{
 *              #define YYSTYPE struct node *
 *              %}
 * 5. Define %token's in the grammar for IDENTIFIER, STRING and INTEGER.
 *    Using "_KW" as a suffix for keyword tokens, i.e. "struct" is
 *    "%token STRUCT_KW":
 *      // atomic values
 *      %token INTEGER STRING IDENTIFIER
 *      // keywords
 *      %token STRUCT_KW CASE_KW
 *      // operators
 *      %token PLUS MINUS ASSIGN ARROW
 *      // overloaded tokens (++ --, < > <= >=, == !=, += -= *= ...)
 *      %token INCOP RELOP EQUOP ASSOP
 * 6. It's easiest to use the yacc(1) generated token numbers for node
 *    labels.  For node labels that are not actually part of the grammar,
 *    use a %token with an L_ prefix:
 *      // node labels (can't be generated by lex)
 *      %token L_LT L_LTE L_GT L_GTE L_EQU L_NEQ
 * 7. Call set_lex_input() before parsing.
 */

/* Quote characters recognized by isquote(). */
#define SQ      '\''
#define DQ      '"'

/*
 * Character classification used by str_to_sv() when splitting a
 * preprocessor line into words.  Note that iswhite() treats a newline
 * as white space and a carriage return as an ordinary character.
 */
#define isquote(c) ((c) == SQ || (c) == DQ)
#define iswhite(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\f')

/*
 * Character classification used by yylex().  These deliberately avoid
 * <ctype.h> so that only plain ASCII is accepted.  is_white() differs
 * from iswhite() above: it accepts a carriage return but NOT a newline,
 * because yylex() handles newlines itself in order to count lines.
 *
 * All of these macros evaluate their argument more than once, so the
 * argument must not have side effects.  Every use of the argument is
 * parenthesized so that an expression argument is classified as a whole.
 */
#define is_between(c, l, u)  ((l) <= (c) && (c) <= (u))
#define is_white(c)     \
        ((c) == ' ' || (c) == '\r' || (c) == '\t' || (c) == '\f')
#define is_lower(c)     is_between((c), 'a', 'z')
#define is_upper(c)     is_between((c), 'A', 'Z')
#define is_alpha(c)     (is_lower(c) || is_upper(c))
#define is_digit(c)     is_between((c), '0', '9')
/* First character of an identifier, and any following character. */
#define is_sstart(c)    (is_alpha(c) || (c) == '_')
#define is_sfollow(c)   (is_sstart(c) || is_digit(c))
#define is_xdigit(c)    \
        (is_digit(c) || is_between((c), 'A', 'F') || is_between((c), 'a', 'f'))

/* Head of the singly linked symbol list walked by sym_find()/sym_enter(). */
ndr_symbol_t            *symbol_list;
/* Head of the list of distinct integer constants kept by int_enter(). */
static ndr_integer_t    *integer_list;
/* Input stream read by yylex(); assigned by set_lex_input(). */
static FILE             *lex_infp;
/*
 * Symbol naming the current input file.  Set by set_lex_input() and
 * replaced whenever yylex() sees a "# <line> <file>" line.
 */
static ndr_symbol_t     *file_name;
/* Current input line; reported by compile_error(), recorded by n_cons(). */
int                     line_number;
/* Number of times compile_error() has been called. */
int                     n_compile_error;

/*
 * Nonzero while yylex() is at the beginning of a line, which is the
 * only place a '#' is treated as the start of a preprocessor line.
 */
static int              lex_at_bol;

/* In yacc(1) generated parser */
extern struct node      *yylval;

/*
 * The keywtable[] and optable[] could be external to this lex
 * and it would all still work.
 *
 * keywtable[] lists the reserved words.  Each entry gives the spelling
 * that is entered into the symbol table by keyw_tab_init() and the token
 * number yylex() returns when an identifier with that spelling is read.
 * The third initializer is 0 for ordinary keywords and 1, 2 or 4 for the
 * BASIC_TYPE entries; presumably that is the size of the type in bytes --
 * confirm against the ndr_keyword_t definition in ndrgen.h.
 *
 * The all-zero entry terminates the table (keyw_tab_init() stops at the
 * first entry whose name is NULL).
 */
static ndr_keyword_t keywtable[] = {
        { "struct",     STRUCT_KW,      0 },
        { "union",      UNION_KW,       0 },
        { "typedef",    TYPEDEF_KW,     0 },

        { "interface",  INTERFACE_KW,   0 },
        { "uuid",       UUID_KW,        0 },
        { "_no_reorder", _NO_REORDER_KW, 0 },
        { "extern",     EXTERN_KW,      0 },
        { "reference",  REFERENCE_KW,   0 },

        { "align",      ALIGN_KW,       0 },
        { "operation",  OPERATION_KW,   0 },
        { "in",         IN_KW,          0 },
        { "out",        OUT_KW,         0 },

        { "string",     STRING_KW,      0 },
        { "size_is",    SIZE_IS_KW,     0 },
        { "length_is",  LENGTH_IS_KW,   0 },

        { "switch_is",  SWITCH_IS_KW,   0 },
        { "case",       CASE_KW,        0 },
        { "default",    DEFAULT_KW,     0 },

        { "transmit_as", TRANSMIT_AS_KW, 0 },
        { "arg_is",     ARG_IS_KW,      0 },
        { "fake",       FAKE_KW,        0 },

        { "char",       BASIC_TYPE,     1 },
        { "uchar",      BASIC_TYPE,     1 },
        { "wchar",      BASIC_TYPE,     2 },
        { "short",      BASIC_TYPE,     2 },
        { "ushort",     BASIC_TYPE,     2 },
        { "long",       BASIC_TYPE,     4 },
        { "ulong",      BASIC_TYPE,     4 },
        {0}
};

/*
 * Operator and punctuation tokens.  These are entered into the same
 * symbol table as the keywords (see set_lex_input()), which is how
 * yylex() maps an operator lexeme to its token number via sym_find().
 *
 * Every entry here is a single character.  yylex() tries a
 * two-character lookup before the one-character lookup, so a
 * two-character operator could be supported by adding it to this table.
 *
 * The all-zero entry terminates the table.
 */
static ndr_keyword_t optable[] = {
        { "{",          LC,             0 },
        { "}",          RC,             0 },
        { "(",          LP,             0 },
        { ")",          RP,             0 },
        { "[",          LB,             0 },
        { "]",          RB,             0 },
        { "*",          STAR,           0 },
        { "/",          DIV,            0 },
        { "%",          MOD,            0 },
        { "-",          MINUS,          0 },
        { "+",          PLUS,           0 },
        { "&",          AND,            0 },
        { "|",          OR,             0 },
        { "^",          XOR,            0 },
        { ";",          SEMI,           0 },
        {0}
};

/* Forward declarations of the file-local helpers defined below. */
static int getch(FILE *fp);
static ndr_integer_t *int_enter(long);
static ndr_symbol_t *sym_enter(char *);
static ndr_symbol_t *sym_find(char *);
static int str_to_sv(char *, char *sv[]);

/*
 * Walk a keyword table up to its NULL-named terminator, making sure
 * each spelling has a symbol and pointing that symbol back at its
 * table entry.
 */
static void
keyw_tab_init(ndr_keyword_t kwtable[])
{
        ndr_keyword_t           *entry;

        for (entry = kwtable; entry->name; entry++)
                sym_enter(entry->name)->kw = entry;
}

/*
 * Prepare the lexer to read from fp.  The keyword and operator tables
 * are loaded into the symbol table, name becomes the current file name
 * used in diagnostics, and line counting restarts at line 1 with the
 * lexer positioned at the beginning of a line.
 */
void
set_lex_input(FILE *fp, char *name)
{
        /* Keywords and operators live in the one symbol table. */
        keyw_tab_init(keywtable);
        keyw_tab_init(optable);

        file_name = sym_enter(name);
        lex_infp = fp;

        lex_at_bol = 1;
        line_number = 1;
}

/*
 * Fetch the next character from fp, or EOF.  All lexer input goes
 * through this one function.
 */
static int
getch(FILE *fp)
{
        int     ch = getc(fp);

        return (ch);
}

/*
 * Lexical analyzer called by the yacc(1) generated parser.
 *
 * Reads from the stream given to set_lex_input() and returns the next
 * token number, or EOF at end of input.  For identifiers, keywords,
 * operators and integer constants yylval is set to the node embedded
 * in the corresponding symbol or integer entry.
 *
 * Along the way this:
 *  - counts lines in line_number;
 *  - interprets "# <line> <file>" lines (as emitted by a C preprocessor)
 *    to track the current file name and line; other '#' lines are
 *    skipped;
 *  - discards white space, "//" comments and C-style block comments;
 *  - reports characters that are not part of any token through
 *    compile_error() and then continues with the next character.
 *
 * Lexemes longer than the internal buffer are truncated rather than
 * written past the end of it; an over-long identifier or integer
 * constant is also reported with compile_error().
 */
int
yylex(void)
{
        char            lexeme[512];
        /* Last position that may hold a character; keeps room for the NUL. */
        char            *const lexend = &lexeme[sizeof (lexeme) - 1];
        char            *p;
        const char      *digits;
        FILE            *fp = lex_infp;
        int             c, xc;
        int             base;
        int             toolong;
        ndr_symbol_t    *sym;
        ndr_integer_t   *intg;

top:
        p = lexeme;
        toolong = 0;

        c = getch(fp);
        if (c == EOF)
                return (EOF);

        if (c == '\n') {
                line_number++;
                lex_at_bol = 1;
                goto top;
        }

        /*
         * Handle preprocessor lines. This just notes
         * which file we're processing.
         */
        if (c == '#' && lex_at_bol) {
                /*
                 * Sized for the largest number of words that can fit
                 * in lexeme[] (one character plus one separator each)
                 * plus the terminating NULL, so str_to_sv() can never
                 * run off the end of the vector.
                 */
                char            *sv[sizeof (lexeme) / 2 + 1];
                int             sc;

                /* Collect the rest of the line; excess text is dropped. */
                while ((c = getch(fp)) != EOF && c != '\n') {
                        if (p < lexend)
                                *p++ = c;
                }

                *p = 0;
                /* note: no ungetc() of newline, it is accounted for below */

                if (*lexeme != ' ' || (sc = str_to_sv(lexeme, sv)) < 2) {
                        /*
                         * Not a line we know.  Its newline was consumed
                         * above, so count it here to keep line_number
                         * in step with the input.  lex_at_bol is still
                         * set, so the next line may also start with '#'.
                         */
                        if (c == '\n')
                                line_number++;
                        goto top;
                }

                /*
                 * "# <line> <file> ...": the line number given is that
                 * of the NEXT input line, so the newline consumed above
                 * must not be counted.
                 */
                file_name = sym_enter(sv[1]);
                line_number = atoi(sv[0]);      /* for next input line */
                lex_at_bol = 1;
                goto top;
        }

        lex_at_bol = 0;

        /*
         * Skip white space
         */
        if (is_white(c))
                goto top;

        /*
         * Symbol? Might be a keyword or just an identifier
         */
        if (is_sstart(c)) {
                /* we got a symbol */
                do {
                        if (p < lexend)
                                *p++ = c;
                        else
                                toolong = 1;
                        c = getch(fp);
                } while (is_sfollow(c));
                (void) ungetc(c, fp);
                *p = 0;

                if (toolong) {
                        compile_error("identifier too long "
                            "(limit %d characters)",
                            (int)(sizeof (lexeme) - 1));
                }

                sym = sym_enter(lexeme);

                yylval = &sym->s_node;

                if (sym->kw) {
                        return (sym->kw->token);
                } else {
                        return (IDENTIFIER);
                }
        }

        /*
         * Integer constant?
         */
        if (is_digit(c)) {
                /*
                 * By default let strtol() work out the radix from the
                 * text: "0x" prefix is hex, leading "0" is octal,
                 * anything else is decimal.
                 */
                digits = lexeme;
                base = 0;

                /* we got a number */
                *p++ = c;
                if (c == '0') {
                        c = getch(fp);
                        if (c == 'x' || c == 'X') {
                                /* handle hex specially */
                                do {
                                        if (p < lexend)
                                                *p++ = c;
                                        else
                                                toolong = 1;
                                        c = getch(fp);
                                } while (is_xdigit(c));
                                goto convert_icon;
                        } else if (c == 'b' || c == 'B' ||
                            c == 'd' || c == 'D' ||
                            c == 'o' || c == 'O') {
                                /*
                                 * strtol() does not understand these
                                 * prefixes (it would stop at the letter
                                 * and yield 0), so select the radix
                                 * here and convert only the digits that
                                 * follow the two-character prefix.
                                 */
                                if (c == 'b' || c == 'B')
                                        base = 2;
                                else if (c == 'd' || c == 'D')
                                        base = 10;
                                else
                                        base = 8;

                                do {
                                        if (p < lexend)
                                                *p++ = c;
                                        else
                                                toolong = 1;
                                        c = getch(fp);
                                } while (is_digit(c));
                                digits = lexeme + 2;
                                goto convert_icon;
                        }
                        (void) ungetc(c, fp);
                }
                /* could be anything */
                c = getch(fp);
                while (is_digit(c)) {
                        if (p < lexend)
                                *p++ = c;
                        else
                                toolong = 1;
                        c = getch(fp);
                }

convert_icon:
                *p = 0;
                (void) ungetc(c, fp);

                if (toolong) {
                        compile_error("integer constant too long "
                            "(limit %d characters)",
                            (int)(sizeof (lexeme) - 1));
                }

                intg = int_enter(strtol(digits, 0, base));
                yylval = &intg->s_node;

                return (INTEGER);
        }

        /* Could handle strings. We don't seem to need them yet */

        yylval = 0;             /* operator tokens have no value */
        xc = getch(fp);         /* get look-ahead for two-char lexemes */

        lexeme[0] = c;
        lexeme[1] = xc;
        lexeme[2] = 0;

        /*
         * Look for to-end-of-line comment
         */
        if (c == '/' && xc == '/') {
                /* eat the comment */
                while ((c = getch(fp)) != EOF && c != '\n')
                        ;
                (void) ungetc(c, fp);           /* put back newline */
                goto top;
        }

        /*
         * Look for multi-line comment
         */
        if (c == '/' && xc == '*') {
                /* eat the comment */
                xc = -1;
                while ((c = getch(fp)) != EOF) {
                        if (xc == '*' && c == '/') {
                                /* that's it */
                                break;
                        }
                        xc = c;
                        if (c == '\n')
                                line_number++;
                }
                goto top;
        }

        /*
         * Use symbol table lookup for two-character and
         * one character operator tokens.
         *
         * The symbol table also holds symbols that are not keywords
         * (file names from '#' lines, for instance), so a match only
         * counts as an operator when a keyword entry is attached.
         */
        sym = sym_find(lexeme);
        if (sym != NULL && sym->kw != NULL) {
                yylval = &sym->s_node;
                return (sym->kw->token);
        }

        /* Try a one-character form */
        (void) ungetc(xc, fp);
        lexeme[1] = 0;
        sym = sym_find(lexeme);
        if (sym != NULL && sym->kw != NULL) {
                yylval = &sym->s_node;
                return (sym->kw->token);
        }

        if (is_between(c, ' ', '~'))
                compile_error("unrecognized character: 0x%02x (%c)", c, c);
        else
                compile_error("unrecognized character: 0x%02x", c);
        goto top;
}

/*
 * Look up name in the symbol list.  Returns the matching symbol, or
 * a null pointer when there is none.  Never adds to the list.
 */
static ndr_symbol_t *
sym_find(char *name)
{
        ndr_symbol_t            *sym = symbol_list;

        while (sym != 0 && strcmp(sym->name, name) != 0)
                sym = sym->next;

        return (sym);
}

/*
 * Return the symbol for name, creating it if necessary.
 *
 * A new symbol gets its own copy of the name, is labelled IDENTIFIER,
 * has its node pointing back at the symbol, and is appended to the end
 * of the symbol list.  Running out of memory is fatal.
 */
static ndr_symbol_t *
sym_enter(char *name)
{
        ndr_symbol_t            **link = &symbol_list;
        ndr_symbol_t            *sym;

        /* Search, remembering the link a new entry would hang from. */
        while ((sym = *link) != 0) {
                if (strcmp(sym->name, name) == 0)
                        return (sym);
                link = &sym->next;
        }

        sym = ndr_alloc(1, sizeof (*sym));

        sym->name = strdup(name);
        if (sym->name == NULL)
                fatal_error("%s", strerror(ENOMEM));

        sym->s_node.n_sym = sym;
        sym->s_node.label = IDENTIFIER;

        *link = sym;

        return (sym);
}

/*
 * Return the entry for the integer constant value, creating it if
 * necessary.  Each distinct value has exactly one entry; a new entry
 * is labelled INTEGER and appended to the end of the integer list.
 */
static ndr_integer_t *
int_enter(long value)
{
        ndr_integer_t           **link = &integer_list;
        ndr_integer_t           *ent;

        /* Search, remembering the link a new entry would hang from. */
        while ((ent = *link) != 0) {
                if (ent->value == value)
                        return (ent);
                link = &ent->next;
        }

        ent = ndr_alloc(1, sizeof (*ent));

        ent->s_node.label = INTEGER;
        ent->s_node.n_int = value;
        ent->value = value;

        *link = ent;

        return (ent);
}

/*
 * Allocate zero-filled memory for nelem elements of elsize bytes each.
 * Allocation failure is reported through fatal_error(), so callers
 * need not check the result.
 */
void *
ndr_alloc(size_t nelem, size_t elsize)
{
        void *mem = calloc(nelem, elsize);

        if (mem != NULL)
                return (mem);

        fatal_error("%s", strerror(ENOMEM));
        /* NOTREACHED */
        return (NULL);
}

/*
 * Report a (non-fatal) error in the input being compiled.
 *
 * The lexer owns the input context, so the message is prefixed here
 * with the current file name and line number to keep all diagnostics
 * in one format.  fmt and the following arguments are as for printf(3C);
 * the formatted text is limited to what fits in NDLBUFSZ bytes.
 * Each call bumps n_compile_error.
 */
void
compile_error(const char *fmt, ...)
{
        va_list args;
        char    msg[NDLBUFSZ];

        va_start(args, fmt);
        (void) vsnprintf(msg, sizeof (msg), fmt, args);
        va_end(args);

        (void) fprintf(stderr, "ndrgen: compile error: %s:%d: %s\n",
            file_name->name, line_number, msg);

        n_compile_error++;
}

/*
 * Report an unrecoverable error on stderr and exit with status 1.
 * fmt and the following arguments are as for printf(3C); the formatted
 * text is limited to what fits in NDLBUFSZ bytes.  Does not return.
 */
void
fatal_error(const char *fmt, ...)
{
        va_list args;
        char    msg[NDLBUFSZ];

        va_start(args, fmt);
        (void) vsnprintf(msg, sizeof (msg), fmt, args);
        va_end(args);

        (void) fprintf(stderr, "ndrgen: fatal error: %s\n", msg);
        exit(1);
}

/*
 * Construct a new node with the given label.
 *
 * Exactly three pointer arguments must follow label; all three are
 * fetched unconditionally and stored in n_arg[0..2].  The node also
 * records the file name and line number current at the time of the
 * call, for use in later diagnostics.
 */
struct node *
n_cons(int label, ...)
{
        ndr_node_t              *node = ndr_alloc(1, sizeof (*node));
        va_list                 args;
        int                     i;

        node->label = label;

        va_start(args, label);
        for (i = 0; i < 3; i++)
                node->n_arg[i] = va_arg(args, void *);
        va_end(args);

        node->file_name = file_name;
        node->line_number = line_number;

        return (node);
}

/*
 * Append np2 to the end of the n_next chain that starts at np1.
 * Intended for building lists in grammar actions:
 *
 *      list:   item
 *      |       list item       ={ n_splice($1, $2); }
 *      ;
 */
void
n_splice(struct node *np1, struct node *np2)
{
        struct node     *tail;

        /* Find the last node of the existing chain. */
        for (tail = np1; tail->n_next; tail = tail->n_next)
                ;

        tail->n_next = np2;
}

/*
 * Convert a string of words to a vector of strings.
 * Returns the number of words.
 *
 * buf is split IN PLACE: words are separated by white space (iswhite),
 * each word is NUL-terminated inside buf, and sv[] receives a pointer
 * to the start of each word followed by a terminating null pointer.
 * Text between matching single or double quotes is taken literally
 * (white space included) and the quote characters themselves are
 * removed; an unterminated quote simply ends at the end of buf.
 *
 * sv[] must have room for at least SV_MAX entries.  At most SV_MAX - 1
 * words are returned; any further words in buf are ignored so that the
 * vector is never overrun.
 */
static int
str_to_sv(char *buf, char *sv[])
{
        enum { SV_MAX = 10 };   /* entries in sv[], including the NULL */
        char            **pp = sv;
        char            *p = buf;       /* read position */
        char            *q = buf;       /* write position, never ahead of p */
        int             in_word = 0;
        int             c;

        for (;;) {
                c = *p++;
                if (c == 0)
                        break;

                if (!in_word) {
                        if (iswhite(c))
                                continue;

                        /*
                         * A new word starts here.  Stop if storing it
                         * would leave no room for the terminating NULL.
                         */
                        if (pp - sv >= SV_MAX - 1)
                                break;

                        *pp++ = q;
                        in_word = 1;
                }

                if (isquote(c)) {
                        int             qc = c;

                        /* copy up to, but not including, the closing quote */
                        while (((c = *p++) != 0) && (c != qc))
                                *q++ = c;
                        if (c == 0)
                                break;
                } else if (iswhite(c)) {
                        /* end of word */
                        *q++ = 0;
                        in_word = 0;
                } else {
                        /* still inside word */
                        *q++ = c;
                }
        }

        if (in_word)
                *q++ = 0;

        *pp = (char *)0;
        return ((int)(pp - sv));
}