root/usr/src/cmd/awk/lex.c
/*
 * Copyright (C) Lucent Technologies 1997
 * All Rights Reserved
 *
 * Permission to use, copy, modify, and distribute this software and
 * its documentation for any purpose and without fee is hereby
 * granted, provided that the above copyright notice appear in all
 * copies and that both that the copyright notice and this
 * permission notice and warranty disclaimer appear in supporting
 * documentation, and that the name Lucent Technologies or any of
 * its entities not be used in advertising or publicity pertaining
 * to distribution of the software without specific, written prior
 * permission.
 *
 * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
 * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 * THIS SOFTWARE.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "awk.h"
#include "y.tab.h"

extern YYSTYPE  yylval;
extern int      infunc;

off_t   lineno  = 1;
int     bracecnt = 0;
int     brackcnt  = 0;
int     parencnt = 0;

typedef struct Keyword {
        const char *word;
        int     sub;
        int     type;
} Keyword;

Keyword keywords[] = {  /* keep sorted: binary searched */
        { "BEGIN",      XBEGIN,         XBEGIN },
        { "END",        XEND,           XEND },
        { "NF",         VARNF,          VARNF },
        { "atan2",      FATAN,          BLTIN },
        { "break",      BREAK,          BREAK },
        { "close",      CLOSE,          CLOSE },
        { "continue",   CONTINUE,       CONTINUE },
        { "cos",        FCOS,           BLTIN },
        { "delete",     DELETE,         DELETE },
        { "do",         DO,             DO },
        { "else",       ELSE,           ELSE },
        { "exit",       EXIT,           EXIT },
        { "exp",        FEXP,           BLTIN },
        { "fflush",     FFLUSH,         BLTIN },
        { "for",        FOR,            FOR },
        { "func",       FUNC,           FUNC },
        { "function",   FUNC,           FUNC },
        { "getline",    GETLINE,        GETLINE },
        { "gsub",       GSUB,           GSUB },
        { "if",         IF,             IF },
        { "in",         IN,             IN },
        { "index",      INDEX,          INDEX },
        { "int",        FINT,           BLTIN },
        { "length",     FLENGTH,        BLTIN },
        { "log",        FLOG,           BLTIN },
        { "match",      MATCHFCN,       MATCHFCN },
        { "next",       NEXT,           NEXT },
        { "nextfile",   NEXTFILE,       NEXTFILE },
        { "print",      PRINT,          PRINT },
        { "printf",     PRINTF,         PRINTF },
        { "rand",       FRAND,          BLTIN },
        { "return",     RETURN,         RETURN },
        { "sin",        FSIN,           BLTIN },
        { "split",      SPLIT,          SPLIT },
        { "sprintf",    SPRINTF,        SPRINTF },
        { "sqrt",       FSQRT,          BLTIN },
        { "srand",      FSRAND,         BLTIN },
        { "sub",        SUB,            SUB },
        { "substr",     SUBSTR,         SUBSTR },
        { "system",     FSYSTEM,        BLTIN },
        { "tolower",    FTOLOWER,       BLTIN },
        { "toupper",    FTOUPPER,       BLTIN },
        { "while",      WHILE,          WHILE },
};

#define RET(x)  { if (dbg) (void) printf("lex %s\n", tokname(x)); return (x); }

int
peek(void)
{
        int c = input();
        unput(c);
        return (c);
}

int
gettok(char **pbuf, size_t *psz)        /* get next input token */
{
        int c, retc;
        char *buf = *pbuf;
        size_t sz = *psz;
        char *bp = buf;

        c = input();
        if (c == 0)
                return (0);
        buf[0] = c;
        buf[1] = 0;
        if (!isalnum(c) && c != '.' && c != '_')
                return (c);

        *bp++ = c;
        if (isalpha(c) || c == '_') {   /* it's a varname */
                for (; (c = input()) != 0; ) {
                        if (bp-buf >= sz &&
                            !adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
                                FATAL("out of space for name %.10s...", buf);
                        if (isalnum(c) || c == '_')
                                *bp++ = c;
                        else {
                                *bp = 0;
                                unput(c);
                                break;
                        }
                }
                *bp = 0;
                retc = 'a';     /* alphanumeric */
        } else {        /* maybe it's a number, but could be . */
                char *rem;
                /* read input until can't be a number */
                for (; (c = input()) != 0; ) {
                        if (bp-buf >= sz &&
                            !adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
                                FATAL("out of space for number %.10s...", buf);
                        if (isdigit(c) || c == 'e' || c == 'E' ||
                            c == '.' || c == '+' || c == '-')
                                *bp++ = c;
                        else {
                                unput(c);
                                break;
                        }
                }
                *bp = 0;
                (void) strtod(buf, &rem);       /* parse the number */
                if (rem == buf) {       /* it wasn't a valid number at all */
                        buf[1] = 0;     /* return one character as token */
                        retc = buf[0];  /* character is its own type */
                        unputstr(rem+1); /* put rest back for later */
                } else {        /* some prefix was a number */
                        unputstr(rem);  /* put rest back for later */
                        rem[0] = 0;     /* truncate buf after number part */
                        retc = '0';     /* type is number */
                }
        }
        *pbuf = buf;
        *psz = sz;
        return (retc);
}

int     word(char *);
int     string(void);
int     regexpr(void);
int     sc      = 0;    /* 1 => return a } right now */
int     reg     = 0;    /* 1 => return a REGEXPR now */

int
yylex(void)
{
        int c;
        static char *buf = NULL;
        /* BUG: setting this small causes core dump! */
        static size_t bufsize = 5;

        if (buf == NULL && (buf = (char *)malloc(bufsize)) == NULL)
                FATAL("out of space in yylex");
        if (sc) {
                sc = 0;
                RET('}');
        }
        if (reg) {
                reg = 0;
                return (regexpr());
        }
        for (;;) {
                c = gettok(&buf, &bufsize);
                if (c == 0)
                        return (0);
                if (isalpha(c) || c == '_')
                        return (word(buf));
                if (isdigit(c)) {
                        yylval.cp = setsymtab(
                            buf, tostring(buf), atof(buf), CON|NUM, symtab);
                        /* should this also have STR set? */
                        RET(NUMBER);
                }

                yylval.i = c;
                switch (c) {
                case '\n':      /* {EOL} */
                        lineno++;
                        RET(NL);
                case '\r':      /* assume \n is coming */
                case ' ':       /* {WS}+ */
                case '\t':
                        break;
                case '#':       /* #.* strip comments */
                        while ((c = input()) != '\n' && c != 0)
                                ;
                        unput(c);
                        break;
                case ';':
                        RET(';');
                case '\\':
                        if (peek() == '\n') {
                                (void) input();
                                lineno++;
                        } else if (peek() == '\r') {
                                (void) input();
                                (void) input(); /* BUG: check for \n */
                                lineno++;
                        } else {
                                RET(c);
                        }
                        break;
                case '&':
                        if (peek() == '&') {
                                (void) input();
                                RET(AND);
                        } else
                                RET('&');
                case '|':
                        if (peek() == '|') {
                                (void) input();
                                RET(BOR);
                        } else
                                RET('|');
                case '!':
                        if (peek() == '=') {
                                (void) input();
                                yylval.i = NE;
                                RET(NE);
                        } else if (peek() == '~') {
                                (void) input();
                                yylval.i = NOTMATCH;
                                RET(MATCHOP);
                        } else
                                RET(NOT);
                case '~':
                        yylval.i = MATCH;
                        RET(MATCHOP);
                case '<':
                        if (peek() == '=') {
                                (void) input();
                                yylval.i = LE;
                                RET(LE);
                        } else {
                                yylval.i = LT;
                                RET(LT);
                        }
                case '=':
                        if (peek() == '=') {
                                (void) input();
                                yylval.i = EQ;
                                RET(EQ);
                        } else {
                                yylval.i = ASSIGN;
                                RET(ASGNOP);
                        }
                case '>':
                        if (peek() == '=') {
                                (void) input();
                                yylval.i = GE;
                                RET(GE);
                        } else if (peek() == '>') {
                                (void) input();
                                yylval.i = APPEND;
                                RET(APPEND);
                        } else {
                                yylval.i = GT;
                                RET(GT);
                        }
                case '+':
                        if (peek() == '+') {
                                (void) input();
                                yylval.i = INCR;
                                RET(INCR);
                        } else if (peek() == '=') {
                                (void) input();
                                yylval.i = ADDEQ;
                                RET(ASGNOP);
                        } else
                                RET('+');
                case '-':
                        if (peek() == '-') {
                                (void) input();
                                yylval.i = DECR;
                                RET(DECR);
                        } else if (peek() == '=') {
                                (void) input();
                                yylval.i = SUBEQ;
                                RET(ASGNOP);
                        } else
                                RET('-');
                case '*':
                        if (peek() == '=') {    /* *= */
                                (void) input();
                                yylval.i = MULTEQ;
                                RET(ASGNOP);
                        } else if (peek() == '*') {     /* ** or **= */
                                (void) input(); /* eat 2nd * */
                                if (peek() == '=') {
                                        (void) input();
                                        yylval.i = POWEQ;
                                        RET(ASGNOP);
                                } else {
                                        RET(POWER);
                                }
                        } else
                                RET('*');
                case '/':
                        RET('/');
                case '%':
                        if (peek() == '=') {
                                (void) input();
                                yylval.i = MODEQ;
                                RET(ASGNOP);
                        } else
                                RET('%');
                case '^':
                        if (peek() == '=') {
                                (void) input();
                                yylval.i = POWEQ;
                                RET(ASGNOP);
                        } else
                                RET(POWER);

                case '$':
                        /* BUG: awkward, if not wrong */
                        c = gettok(&buf, &bufsize);
                        if (isalpha(c)) {
                                if (strcmp(buf, "NF") == 0) {
                                        /* very special */
                                        unputstr("(NF)");
                                        RET(INDIRECT);
                                }
                                c = peek();
                                if (c == '(' || c == '[' ||
                                    (infunc && isarg(buf) >= 0)) {
                                        unputstr(buf);
                                        RET(INDIRECT);
                                }
                                yylval.cp = setsymtab(
                                    buf, "", 0.0, STR|NUM, symtab);
                                RET(IVAR);
                        } else if (c == 0) {    /*  */
                                SYNTAX("unexpected end of input after $");
                                RET(';');
                        } else {
                                unputstr(buf);
                                RET(INDIRECT);
                        }

                case '}':
                        if (--bracecnt < 0)
                                SYNTAX("extra }");
                        sc = 1;
                        RET(';');
                case ']':
                        if (--brackcnt < 0)
                                SYNTAX("extra ]");
                        RET(']');
                case ')':
                        if (--parencnt < 0)
                                SYNTAX("extra )");
                        RET(')');
                case '{':
                        bracecnt++;
                        RET('{');
                case '[':
                        brackcnt++;
                        RET('[');
                case '(':
                        parencnt++;
                        RET('(');

                case '"':
                        /* BUG: should be like tran.c ? */
                        return (string());

                default:
                        RET(c);
                }
        }
}

int
string(void)
{
        int c, n;
        char *s, *bp;
        static char *buf = NULL;
        static size_t bufsz = 500;

        if (buf == NULL && (buf = (char *)malloc(bufsz)) == NULL)
                FATAL("out of space for strings");
        for (bp = buf; (c = input()) != '"'; ) {
                if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
                        FATAL("out of space for string %.10s...", buf);
                switch (c) {
                case '\n':
                case '\r':
                case 0:
                        *bp = '\0';
                        SYNTAX("non-terminated string %.10s...", buf);
                        if (c == 0)     /* hopeless */
                                FATAL("giving up");
                        lineno++;
                        break;
                case '\\':
                        c = input();
                        switch (c) {
                        case '"': *bp++ = '"'; break;
                        case 'n': *bp++ = '\n'; break;
                        case 't': *bp++ = '\t'; break;
                        case 'f': *bp++ = '\f'; break;
                        case 'r': *bp++ = '\r'; break;
                        case 'b': *bp++ = '\b'; break;
                        case 'v': *bp++ = '\v'; break;
                        case 'a': *bp++ = '\007'; break;
                        case '\\': *bp++ = '\\'; break;

                        case '0': case '1': case '2': /* octal: \d \dd \ddd */
                        case '3': case '4': case '5': case '6': case '7':
                                n = c - '0';
                                if ((c = peek()) >= '0' && c < '8') {
                                        n = 8 * n + input() - '0';
                                        if ((c = peek()) >= '0' && c < '8')
                                                n = 8 * n + input() - '0';
                                }
                                *bp++ = n;
                                break;

                        case 'x': {     /* hex  \x0-9a-fA-F + */
                                char xbuf[100], *px;
                                px = xbuf;
                                while ((c = input()) != 0 && px-xbuf < 100-2) {
                                        if (isdigit(c) ||
                                            (c >= 'a' && c <= 'f') ||
                                            (c >= 'A' && c <= 'F'))
                                                *px++ = c;
                                        else
                                                break;
                                }
                                *px = 0;
                                unput(c);
                                (void) sscanf(xbuf, "%x", (unsigned int *)&n);
                                *bp++ = n;
                                break;
                        }

                        default:
                                *bp++ = c;
                                break;
                        }
                        break;
                default:
                        *bp++ = c;
                        break;
                }
        }
        *bp = 0;
        s = tostring(buf);
        *bp++ = ' '; *bp++ = 0;
        yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
        RET(STRING);
}


int
binsearch(char *w, Keyword *kp, int n)
{
        int cond, low, mid, high;

        low = 0;
        high = n - 1;
        while (low <= high) {
                mid = (low + high) / 2;
                if ((cond = strcmp(w, kp[mid].word)) < 0)
                        high = mid - 1;
                else if (cond > 0)
                        low = mid + 1;
                else
                        return (mid);
        }
        return (-1);
}

int
word(char *w)
{
        Keyword *kp;
        int c, n;

        n = binsearch(w, keywords, sizeof (keywords) / sizeof (keywords[0]));
        if (n != -1) {  /* found in table */
                kp = keywords + n;
                yylval.i = kp->sub;
                switch (kp->type) {     /* special handling */
                case BLTIN:
                        if (kp->sub == FSYSTEM && safe)
                                SYNTAX("system is unsafe");
                        RET(kp->type);
                case FUNC:
                        if (infunc)
                                SYNTAX("illegal nested function");
                        RET(kp->type);
                case RETURN:
                        if (!infunc)
                                SYNTAX("return not in function");
                        RET(kp->type);
                case VARNF:
                        yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
                        RET(VARNF);
                default:
                        RET(kp->type);
                }
        }
        c = peek();     /* look for '(' */
        if (c != '(' && infunc && (n = isarg(w)) >= 0) {
                yylval.i = n;
                RET(ARG);
        } else {
                yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
                if (c == '(') {
                        RET(CALL);
                } else {
                        RET(VAR);
                }
        }
}

void
startreg(void)  /* next call to yylex will return a regular expression */
{
        reg = 1;
}

int
regexpr(void)
{
        int c;
        static char *buf = NULL;
        static size_t bufsz = 500;
        char *bp;

        if (buf == NULL && (buf = (char *)malloc(bufsz)) == NULL)
                FATAL("out of space for rex expr");
        bp = buf;
        for (; (c = input()) != '/' && c != 0; ) {
                if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
                        FATAL("out of space for reg expr %.10s...", buf);
                if (c == '\n') {
                        *bp = '\0';
                        SYNTAX("newline in regular expression %.10s...", buf);
                        unput('\n');
                        break;
                } else if (c == '\\') {
                        *bp++ = '\\';
                        *bp++ = input();
                } else {
                        *bp++ = c;
                }
        }
        *bp = 0;
        if (c == 0)
                SYNTAX("non-terminated regular expression %.10s...", buf);
        yylval.s = tostring(buf);
        unput('/');
        RET(REGEXPR);
}

/* low-level lexical stuff, sort of inherited from lex */

char    ebuf[300];
char    *ep = ebuf;
char    yysbuf[100];    /* pushback buffer */
char    *yysptr = yysbuf;
FILE    *yyin = NULL;

int
input(void)     /* get next lexical input character */
{
        int c;
        extern char *lexprog;

        if (yysptr > yysbuf)
                c = (uschar)*--yysptr;
        else if (lexprog != NULL) {     /* awk '...' */
                if ((c = (uschar)*lexprog) != 0)
                        lexprog++;
        } else                          /* awk -f ... */
                c = pgetc();
        if (c == EOF)
                c = 0;
        if (ep >= ebuf + sizeof (ebuf))
                ep = ebuf;
        *ep = c;
        if (c != 0) {
                ep++;
        }
        return (c);
}

void
unput(int c)    /* put lexical character back on input */
{
        if (yysptr >= yysbuf + sizeof (yysbuf))
                FATAL("pushed back too much: %.20s...", yysbuf);
        *yysptr++ = c;
        if (--ep < ebuf)
                ep = ebuf + sizeof (ebuf) - 1;
}

void
unputstr(const char *s) /* put a string back on input */
{
        int i;

        for (i = strlen(s)-1; i >= 0; i--)
                unput(s[i]);
}