root/usr/src/cmd/sgs/lex/common/sub3.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * sub3.c ... ALE enhancement.
 * Since a typical Asian language has a huge character set, it is not
 * ideal to index an array by a character code itself, which requires
 * as large as 2**16 entries per array.
 * To get arround this problem, we identify a set of characters that
 * causes the same transition on all states and call it character group.
 * Every character in a same character group has a unique number called
 * character group id.  A function yycgid(c) maps the character c (in process
 * code) to the id.  This mapping is determined by analyzing all regular
 * expressions in the lex program.
 *
 */
#include        <stdlib.h>
#include        <widec.h>
#include        <search.h>
#include        "ldefs.h"

/*
 * "lchar" stands for linearized character.  It is a variant of
 * process code.  AT&T's 16-bit process code has a drawback in which
 * for three three process code C, D and E where C <= D <= E,
 * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C).
 * In other words, four codesets alternates as the magnitude
 * of character increases.
 * The lchar representation holds this property:
 *   If three lchar C', D' and E' have the relationship C' < D' <  E' and
 *   codeset(C') == codeset(E') then D' is guaranteed to belong to
 *   the same codeset as C' and E'.
 * lchar is implemented as 32 bit entities and the function linearize()
 * that maps a wchar_t to lchar is defined below.  There is no
 * reverse function for it though.
 * The 32-bit process code by AT&T, used only for Taiwanese version at the
 * time of wrting, has no such problem and we use it as it is.
 */

lchar   yycgidtbl[MAXNCG] = {
        0,              /* For ease of computation of the id. */
        '\n',           /* Newline is always special because '.' exclude it. */
        0x000000ff,     /* The upper limit of codeset 0. */
        0x20ffffff,     /* The upper limit of codeset 2. */
        0x40ffffff      /* The upper limit of codeset 3. */
/*      0x60ffffff         The upper limit of codeset 1. */
        /* Above assumes the number of significant bits of wchar_t is <= 24. */
};
int     ncgidtbl = 5; /* # elements in yycgidtbl. */
int     ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */
                /* returns plus 1. */

static void setsymbol(int i);

/*
 * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below:
 *
 *      wc: axxxxxxbyyyyyyy
 *
 * returns: 0ab0000000000000axxxxxxxbyyyyyyy
 *
 * linearize() doesn't do any if compiled with 32-bit wchar_t, use of
 * which is flagged with LONG_WCHAR_T macro.
 * NOTE:
 * The implementation is highly depends on the process code representation.
 * This function should be modified when 32-bit process code is used.
 * There is no need to keep 'a' and 'b' bits in the lower half of lchar.
 * You can actually omit these and squeeze the xxxxxx part one bit right.
 * We don't do that here just in sake of speed.
 */
lchar
linearize(wchar_t wc)
{
#ifdef LONG_WCHAR_T
        return ((lchar)wc); /* Don't do anything. */
#else

        lchar   prefix;
        switch (wc&0x8080) {
        case 0x0000: prefix = 0x00000000; break;
        case 0x0080: prefix = 0x20000000; break;
        case 0x8000: prefix = 0x40000000; break;
        case 0x8080: prefix = 0x60000000; break;
        }
        return (prefix|wc);
#endif
}

/* compare liniear characters pointed to by pc1 and pc2 */
int
cmplc(const void *arg1, const void *arg2)
{
        lchar *pc1 = (lchar *)arg1;
        lchar *pc2 = (lchar *)arg2;

        if (*pc1 > *pc2)
                return (1);
        else if (*pc1 == *pc2)
                return (0);
        else
                return (-1);
}

void
remch(wchar_t c)
{
        lchar   lc = linearize(c);
        size_t  local_ncgidtbl;

        /*
         * User-friendliness consideration:
         * Make sure no EUC chars are used in reg. exp.
         */
        if (!handleeuc) {
                if (!isascii(c)) {
                        if (iswprint(c))
                                warning(
"Non-ASCII character '%wc' in pattern; use -w or -e lex option.", c);
                        else warning(
"Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c);
                }
                /* In any case, we don't need to construct ncgidtbl[]. */
                return;
        }

        /*
         * lsearch wants ncgidtbl to be size_t, but it is int. Hence,
         * the use of local_ncgidtbl to satisfy the calling interface.
         */
        local_ncgidtbl = ncgidtbl;
        (void) lsearch(&lc, yycgidtbl,
            &local_ncgidtbl, sizeof (lchar), cmplc);
        ncgidtbl = (int)local_ncgidtbl;
}

void
sortcgidtbl(void)
{
        if (!handleeuc)
                return;
        qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc);
}

/*
 * int yycgid(wchar_t c)
 *      Takes c and returns its character group id, determind by the
 *      following algorithm.  The program also uses the binary search
 *      algorithm, generalized from Knuth (6.2.1) Algorithm B.
 *
 *      This function computes the "character group id" based on
 *      a table yycgidtbl of which each lchar entry is pre-sorted
 *      in ascending sequence  The number of valid entries is given
 *      by YYNCGIDTBL.  There is no duplicate entries in yycgidtbl.
 *              const int YYNCGIDTBL;
 *              lchar   yycgidtbl[YYNCGIDTBL];
 *
 *      yycgidtbl[0] is guaranteed to have zero.
 *
 *      For given c, yycgid(c) returns:
 *              2*i     iff yycgidtbl[i] == lc
 *              2*i+1   iff yycgidtbl[i] < lc < yycgidtbl[i+1]
 *              YYNCGIDTBL*2-1
 *                      iff yycgidtbl[YYNCGIDTBL-1] < lc
 *      where lc=linearize(c).
 *
 *      Some interesting properties.:
 *      1.  For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1
 *      2.  yycgid(c) == 0  iff  c == 0.
 *      3.  For any wchar_t c and d, if linearize(c) < linearize(d) then
 *          yycgid(c) <= yycgid(d).
 *      4.  For any wchar_t c and d, if yycgid(c) < yycgid(d) then
 *          linearize(c) < linearize(d).
 */
#define YYNCGIDTBL ncgidtbl

int
yycgid(wchar_t c)
{
        int first = 0;
        int last = YYNCGIDTBL - 1;
        lchar lc;

        /*
         * In ASCII compat. mode, each character forms a "group" and the
         * group-id is itself...
         */
        if (!handleeuc)
                return (c);

        lc = linearize(c);

        /* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */
        if (yycgidtbl[YYNCGIDTBL - 1] < lc)
                return (YYNCGIDTBL*2 - 1);

        while (last >= 0) {
                int i = (first+last)/2;
                if (lc == yycgidtbl[i])
                        return (2*i);   /* lc exactly matches an element. */
                else if (yycgidtbl[i] < lc) {
                        if (lc < yycgidtbl[i+1]) {
                                /* lc is in between two elements */
                                return (2*i+1);
                        }
                        else
                                first = i + 1;
                } else
                        last = i - 1;
        }
        error(
        "system error in yycgid():binary search failed for c=0x%04x\n", c);
        return (0);
}

/*
 * repbycgid --- replaces each character in the parsing tree by its
 * character group id.   This, however, should be called even in
 * the ASCII compat. mode to process DOT nodes and to call cclinter()
 * for the DOT and CCL nodes.
 */
void
repbycgid(void)
{
        int i, c;

        for (i = 0; i < tptr; ++i) {
                c = name[i];
                if (!ISOPERATOR(c)) {
                /* If not an operator, it must be a char.  */
                        name[i] = yycgid((wchar_t)c); /* So replace it. */
#ifdef DEBUG
                        if (debug) {
                                printf("name[%d]:'%c'->%d;\n", i, c, name[i]);
                        }
#endif
                } else if (c == RSTR) {
                        c = right[i];
                        right[i] = yycgid((wchar_t)c);
#ifdef DEBUG
                        if (debug) {
                                printf(
                                    "name[%d].right:'%c'->%d;\n",
                                    i, c, right[i]);
                        }
#endif
                } else if ((c == RCCL) || (c == RNCCL)) {
                        CHR cc, *s;
                        int j;
                        CHR ccltoken[CCLSIZE];
                        CHR *ccp;
                        int m;
                        /*
                         * This node represetns a character class RE [ccccc]
                         * s points to the string of characters that forms
                         * the class and/or a special prefix notation
                         * <RANGE>XY which corresponds to the RE X-Y,
                         * characters in the range of X and Y.  Here,
                         * X <= Y is guranteed.
                         * We transform these characters into a string
                         * of sorted character group ids.
                         *
                         * There is another mechanism of packing tables
                         * that is inherited from the ASCII lex.  Call of
                         * cclinter() is required for this packing.
                         * This used to be done as yylex() reads the lex
                         * rules but we have to do this here because the
                         * transition table is made to work on the char-group
                         * ids and the mapping cannot be determined until
                         * the entire file is read.
                         */
#ifdef DEBUG
                        if (debug) {
                                printf("name[%d]:R[N]CCL of \"", i);
                                strpt(left[i]);
                                printf(" -> {");
                        }
#endif
                        /* Prepare symbol[] for cclinter(). */
                        for (j = 0; j < ncg; ++j)
                                symbol[j] = FALSE;

                        s = (CHR *) left[i];
                        while ((cc = *s++) != 0) {
                                if (cc == RANGE) {
                                        int     low, high, i;
                                        /*
                                         * Special form: <RANGE>XY
                                         * This means the range X-Y.
                                         * We mark all symbols[]
                                         * elements for yycgid(X) thru
                                         * yycgid(Y), inclusively.
                                         */
                                        low = yycgid(*s++);
                                        high = yycgid(*s++);
                                        for (i = low; i <= high; ++i)
                                                setsymbol(i);
                                } else {
                                        setsymbol(yycgid(cc));
                                }
                        }

                        /* Now make a transformed string of cgids. */
                        s = ccptr;
                        m = 0;
                        for (j = 0; j < ncg; ++j)
                                if (symbol[j]) {
                                        ccltoken[m++] = (CHR)j;
#ifdef DEBUG
                                        if (debug) printf("%d, ", j);
#endif
                                }

#ifdef DEBUG
                        if (debug) printf("}\n");
#endif
                        ccltoken[m] = 0;
                        ccp = ccl;
                        while (ccp < ccptr && scomp(ccltoken, ccp) != 0)
                                ccp++;
                        if (ccp < ccptr) {  /* character class found in ccl */
                                left[i] = (int)ccp;
                        } else { /* not in ccl, add it */
                                left[i] = (int)ccptr;
                                scopy(ccltoken, ccptr);
                                ccptr += slength(ccltoken) + 1;
                                if (ccptr > ccl + CCLSIZE)
                                        error(
                                        "Too many large character classes");
                        }
                        cclinter(c == RCCL);
                } else if (c == DOT) {
                        if (psave == 0) { /* First DOT node. */
                                int j, nlid;
                                /*
                                 * Make symbol[k]=TRUE for all k
                                 *  except k == yycgid('\n').
                                 */
                                nlid = yycgid('\n');
                                psave = ccptr;
                                for (j = 1; j < ncg; ++j) {
                                        if (j == nlid) {
                                                symbol[j] = FALSE;
                                        } else {
                                                symbol[j] = TRUE;
                                                *ccptr++ = (CHR) j;
                                        }
                                }
                                *ccptr++ = 0;
                                if (ccptr > ccl + CCLSIZE)
                                        error(
                                        "Too many large character classes");
                        }
                        /* Mimic mn1(RCCL,psave)... */
                        name[i] = RCCL;
                        left[i] = (int)psave;
                        cclinter(1);
                }
        }
#ifdef DEBUG
        if (debug) {
                printf("treedump after repbycgid().\n");
                treedump();
        }
#endif
}

static void
setsymbol(int i)
{
        if (i > (int)sizeof (symbol))
                error("setsymbol: (SYSERR) %d out of range", i);
        symbol[i] = TRUE;
}