usr/src/cmd/spell/spellprog.c

root/usr/src/cmd/spell/spellprog.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2015 Gary Mills
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*        All Rights Reserved   */

#include <stdlib.h>
#include <unistd.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include <locale.h>
#include "hash.h"

#define Tolower(c) (isupper(c)?tolower(c):c)
#define DLEV 2

/*
 * ANSI prototypes
 */
static int      ily(char *, char *, char *, int);
static int      s(char *, char *, char *, int);
static int      es(char *, char *, char *, int);
static int      subst(char *, char *, char *, int);
static int      nop(void);
static int      bility(char *, char *, char *, int);
static int      i_to_y(char *, char *, char *, int);
static int      CCe(char *, char *, char *, int);
static int      y_to_e(char *, char *, char *, int);
static int      strip(char *, char *, char *, int);
static int      ize(char *, char *, char *, int);
static int      tion(char *, char *, char *, int);
static int      an(char *, char *, char *, int);
int             prime(char *);
static int      tryword(char *, char *, int);
static int      trypref(char *, char *, int);
static int      trysuff(char *, int);
static int      vowel(int);
static int      dict(char *, char *);
static int      monosyl(char *, char *);
static int      VCe(char *, char *, char *, int);
static char     *skipv(char *);

struct suftab {
        char *suf;
        int (*p1)();
        int n1;
        char *d1;
        char *a1;
        int (*p2)();
        int n2;
        char *d2;
        char *a2;
};

static struct suftab sufa[] = {
        {"ssen", ily, 4, "-y+iness", "+ness" },
        {"ssel", ily, 4, "-y+i+less", "+less" },
        {"se", s, 1, "", "+s",  es, 2, "-y+ies", "+es" },
        {"s'", s, 2, "", "+'s"},
        {"s", s, 1, "", "+s"},
        {"ecn", subst, 1, "-t+ce", ""},
        {"ycn", subst, 1, "-t+cy", ""},
        {"ytilb", nop, 0, "", ""},
        {"ytilib", bility, 5, "-le+ility", ""},
        {"elbaif", i_to_y, 4, "-y+iable", ""},
        {"elba", CCe, 4, "-e+able", "+able"},
        {"yti", CCe, 3, "-e+ity", "+ity"},
        {"ylb", y_to_e, 1, "-e+y", ""},
        {"yl", ily, 2, "-y+ily", "+ly"},
        {"laci", strip, 2, "", "+al"},
        {"latnem", strip, 2, "", "+al"},
        {"lanoi", strip, 2, "", "+al"},
        {"tnem", strip, 4, "", "+ment"},
        {"gni", CCe, 3, "-e+ing", "+ing"},
        {"reta", nop, 0, "", ""},
        {"retc", nop, 0, "", ""},
        {"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
        {"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
        {"citsi", strip, 2, "", "+ic"},
        {"citi", ize, 1, "-ic+e", ""},
        {"cihparg", i_to_y, 1, "-y+ic", ""},
        {"tse", strip, 2, "", "+st",    i_to_y, 3, "-y+iest", "+est"},
        {"cirtem", i_to_y, 1, "-y+ic", ""},
        {"yrtem", subst, 0, "-er+ry", ""},
        {"cigol", i_to_y, 1, "-y+ic", ""},
        {"tsigol", i_to_y, 2, "-y+ist", ""},
        {"tsi", CCe, 3, "-e+ist", "+ist"},
        {"msi", CCe, 3, "-e+ism", "+ist"},
        {"noitacifi", i_to_y, 6, "-y+ication", ""},
        {"noitazi", ize, 4, "-e+ation", ""},
        {"rota", tion, 2, "-e+or", ""},
        {"rotc", tion, 2, "", "+or"},
        {"noit", tion, 3, "-e+ion", "+ion"},
        {"naino", an, 3, "", "+ian"},
        {"na", an, 1, "", "+n"},
        {"evi", subst, 0, "-ion+ive", ""},
        {"ezi", CCe, 3, "-e+ize", "+ize"},
        {"pihs", strip, 4, "", "+ship"},
        {"dooh", ily, 4, "-y+ihood", "+hood"},
        {"luf", ily, 3, "-y+iful", "+ful"},
        {"ekil", strip, 4, "", "+like"},
        0
};

static struct suftab sufb[] = {
        {"ssen", ily, 4, "-y+iness", "+ness" },
        {"ssel", ily, 4, "-y+i+less", "+less" },
        {"se", s, 1, "", "+s",  es, 2, "-y+ies", "+es" },
        {"s'", s, 2, "", "+'s"},
        {"s", s, 1, "", "+s"},
        {"ecn", subst, 1, "-t+ce", ""},
        {"ycn", subst, 1, "-t+cy", ""},
        {"ytilb", nop, 0, "", ""},
        {"ytilib", bility, 5, "-le+ility", ""},
        {"elbaif", i_to_y, 4, "-y+iable", ""},
        {"elba", CCe, 4, "-e+able", "+able"},
        {"yti", CCe, 3, "-e+ity", "+ity"},
        {"ylb", y_to_e, 1, "-e+y", ""},
        {"yl", ily, 2, "-y+ily", "+ly"},
        {"laci", strip, 2, "", "+al"},
        {"latnem", strip, 2, "", "+al"},
        {"lanoi", strip, 2, "", "+al"},
        {"tnem", strip, 4, "", "+ment"},
        {"gni", CCe, 3, "-e+ing", "+ing"},
        {"reta", nop, 0, "", ""},
        {"retc", nop, 0, "", ""},
        {"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
        {"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
        {"citsi", strip, 2, "", "+ic"},
        {"citi", ize, 1, "-ic+e", ""},
        {"cihparg", i_to_y, 1, "-y+ic", ""},
        {"tse", strip, 2, "", "+st",    i_to_y, 3, "-y+iest", "+est"},
        {"cirtem", i_to_y, 1, "-y+ic", ""},
        {"yrtem", subst, 0, "-er+ry", ""},
        {"cigol", i_to_y, 1, "-y+ic", ""},
        {"tsigol", i_to_y, 2, "-y+ist", ""},
        {"tsi", CCe, 3, "-e+ist", "+ist"},
        {"msi", CCe, 3, "-e+ism", "+ist"},
        {"noitacifi", i_to_y, 6, "-y+ication", ""},
        {"noitasi", ize, 4, "-e+ation", ""},
        {"rota", tion, 2, "-e+or", ""},
        {"rotc", tion, 2, "", "+or"},
        {"noit", tion, 3, "-e+ion", "+ion"},
        {"naino", an, 3, "", "+ian"},
        {"na", an, 1, "", "+n"},
        {"evi", subst, 0, "-ion+ive", ""},
        {"esi", CCe, 3, "-e+ise", "+ise"},
        {"pihs", strip, 4, "", "+ship"},
        {"dooh", ily, 4, "-y+ihood", "+hood"},
        {"luf", ily, 3, "-y+iful", "+ful"},
        {"ekil", strip, 4, "", "+like"},
        0
};

static char *preftab[] = {
        "anti",
        "auto",
        "bio",
        "counter",
        "dis",
        "electro",
        "en",
        "fore",
        "geo",
        "hyper",
        "intra",
        "inter",
        "iso",
        "kilo",
        "magneto",
        "meta",
        "micro",
        "mid",
        "milli",
        "mis",
        "mono",
        "multi",
        "non",
        "out",
        "over",
        "photo",
        "poly",
        "pre",
        "pseudo",
        "psycho",
        "re",
        "semi",
        "stereo",
        "sub",
        "super",
        "tele",
        "thermo",
        "ultra",
        "under",        /* must precede un */
        "un",
        0
};

static int bflag;
static int vflag;
static int xflag;
static struct suftab *suftab;
static char *prog;
static char word[LINE_MAX];
static char original[LINE_MAX];
static char *deriv[LINE_MAX];
static char affix[LINE_MAX];
static FILE *file, *found;
/*
 *      deriv is stack of pointers to notes like +micro +ed
 *      affix is concatenated string of notes
 *      the buffer size 141 stems from the sizes of original and affix.
 */

/*
 *      in an attempt to defray future maintenance misunderstandings, here is
 *      an attempt to describe the input/output expectations of the spell
 *      program.
 *
 *      spellprog is intended to be called from the shell file spell.
 *      because of this, there is little error checking (this is historical, not
 *      necessarily advisable).
 *
 *      spellprog options hashed-list pass
 *
 *      the hashed-list is a list of the form made by spellin.
 *      there are 2 types of hashed lists:
 *              1. a stop list: this specifies words that by the rules embodied
 *                 in spellprog would be recognized as correct, BUT are really
 *                 errors.
 *              2. a dictionary of correctly spelled words.
 *      the pass number determines how the words found in the specified
 *      hashed-list are treated. If the pass number is 1, the hashed-list is
 *      treated as the stop-list, otherwise, it is treated as the regular
 *      dictionary list. in this case, the value of "pass" is a filename. Found
 *      words are written to this file.
 *
 *      In the normal case, the filename = /dev/null. However, if the v option
 *      is specified, the derivations are written to this file.
 *      The spellprog looks up words in the hashed-list; if a word is found, it
 *      is printed to the stdout. If the hashed-list was the stop-list, the
 *      words found are presumed to be misspellings. in this case,
 *      a control character is printed ( a "-" is appended to the word.
 *      a hyphen will never occur naturally in the input list because deroff
 *      is used in the shell file before calling spellprog.)
 *      If the regualar spelling list was used (hlista or hlistb), the words
 *      are correct, and may be ditched. (unless the -v option was used -
 *      see the manual page).
 *
 *      spellprog should be called twice : first with the stop-list, to flag all
 *      a priori incorrectly spelled words; second with the dictionary.
 *
 *      spellprog hstop 1 |\
 *      spellprog hlista /dev/null
 *
 *      for a complete scenario, see the shell file: spell.
 *
 */

int
main(int argc, char **argv)
{
        char *ep, *cp;
        char *dp;
        int fold;
        int c, j;
        int pass;

        /* Set locale environment variables local definitions */
        (void) setlocale(LC_ALL, "");
#if !defined(TEXT_DOMAIN)       /* Should be defined by cc -D */
#define TEXT_DOMAIN "SYS_TEST"  /* Use this only if it wasn't */
#endif
        (void) textdomain(TEXT_DOMAIN);


        prog = argv[0];
        while ((c = getopt(argc, argv, "bvx")) != EOF) {
                switch (c) {
                case 'b':
                        bflag++;
                        break;
                case 'v':
                        vflag++;
                        break;
                case 'x':
                        xflag++;
                        break;
                }
        }

        argc -= optind;
        argv = &argv[optind];

        if ((argc < 2) || !prime(*argv)) {
                (void) fprintf(stderr,
                    gettext("%s: cannot initialize hash table\n"), prog);
                exit(1);
        }
        argc--;
        argv++;

        /* Select the correct suffix table */
        suftab = (bflag == 0) ? sufa : sufb;

/*
 *      if pass is not 1, it is assumed to be a filename.
 *      found words are written to this file.
 */
        pass = **argv;
        if (pass != '1')
                found = fopen(*argv, "w");

        for (;;) {
                affix[0] = 0;
                file = stdout;
                for (ep = word; (*ep = j = getchar()) != '\n'; ep++)
                        if (j == EOF)
                                exit(0);
/*
 *      here is the hyphen processing. these words were found in the stop
 *      list. however, if they exist as is, (no derivations tried) in the
 *      dictionary, let them through as correct.
 *
 */
                if (ep[-1] == '-') {
                        *--ep = 0;
                        if (!tryword(word, ep, 0))
                                (void) fprintf(file, "%s\n", word);
                        continue;
                }
                for (cp = word, dp = original; cp < ep; )
                        *dp++ = *cp++;
                *dp = 0;
                fold = 0;
                for (cp = word; cp < ep; cp++)
                        if (islower(*cp))
                                goto lcase;
                if (((ep - word) == 1) &&
                    ((word[0] == 'A') || (word[0] == 'I')))
                        continue;
                if (trypref(ep, ".", 0))
                        goto foundit;
                ++fold;
                for (cp = original+1, dp = word+1; dp < ep; dp++, cp++)
                        *dp = Tolower(*cp);
lcase:
                if (((ep - word) == 1) && (word[0] == 'a'))
                        continue;
                if (trypref(ep, ".", 0)||trysuff(ep, 0))
                        goto foundit;
                if (isupper(word[0])) {
                        for (cp = original, dp = word; *dp = *cp++; dp++)
                                if (fold) *dp = Tolower(*dp);
                        word[0] = Tolower(word[0]);
                        goto lcase;
                }
                (void) fprintf(file, "%s\n", original);
                continue;

foundit:
                if (pass == '1')
                        (void) fprintf(file, "%s-\n", original);
                else if (affix[0] != 0 && affix[0] != '.') {
                        file = found;
                        (void) fprintf(file, "%s\t%s\n", affix,
                            original);
                }
        }
}

/*
 *      strip exactly one suffix and do
 *      indicated routine(s), which may recursively
 *      strip suffixes
 */

static int
trysuff(char *ep, int lev)
{
        struct suftab   *t;
        char *cp, *sp;

        lev += DLEV;
        deriv[lev] = deriv[lev-1] = 0;
        for (t = &suftab[0]; (t != 0 && (sp = t->suf) != 0); t++) {
                cp = ep;
                while (*sp)
                        if (*--cp != *sp++)
                                goto next;
                for (sp = cp; --sp >= word && !vowel(*sp); )
                        ;
                if (sp < word)
                        return (0);
                if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1))
                        return (1);
                if (t->p2 != 0) {
                        deriv[lev] = deriv[lev+1] = 0;
                        return ((*t->p2)(ep-t->n2, t->d2, t->a2, lev));
                }
                return (0);
next:;
        }
        return (0);
}

static int
nop(void)
{
        return (0);
}

/* ARGSUSED */
static int
strip(char *ep, char *d, char *a, int lev)
{
        return (trypref(ep, a, lev)||trysuff(ep, lev));
}

static int
s(char *ep, char *d, char *a, int lev)
{
        if (lev > DLEV+1)
                return (0);
        if (*ep == 's' && ep[-1] == 's')
                return (0);
        return (strip(ep, d, a, lev));
}

/* ARGSUSED */
static int
an(char *ep, char *d, char *a, int lev)
{
        if (!isupper(*word))    /* must be proper name */
                return (0);
        return (trypref(ep, a, lev));
}

/* ARGSUSED */
static int
ize(char *ep, char *d, char *a, int lev)
{
        ep[-1] = 'e';
        return (strip(ep, "", d, lev));
}

/* ARGSUSED */
static int
y_to_e(char *ep, char *d, char *a, int lev)
{
        *ep++ = 'e';
        return (strip(ep, "", d, lev));
}

static int
ily(char *ep, char *d, char *a, int lev)
{
        if (ep[-1] == 'i')
                return (i_to_y(ep, d, a, lev));
        else
                return (strip(ep, d, a, lev));
}

static int
bility(char *ep, char *d, char *a, int lev)
{
        *ep++ = 'l';
        return (y_to_e(ep, d, a, lev));
}

static int
i_to_y(char *ep, char *d, char *a, int lev)
{
        if (ep[-1] == 'i') {
                ep[-1] = 'y';
                a = d;
        }
        return (strip(ep, "", a, lev));
}

static int
es(char *ep, char *d, char *a, int lev)
{
        if (lev > DLEV)
                return (0);
        switch (ep[-1]) {
        default:
                return (0);
        case 'i':
                return (i_to_y(ep, d, a, lev));
        case 's':
        case 'h':
        case 'z':
        case 'x':
                return (strip(ep, d, a, lev));
        }
}

/* ARGSUSED */
static int
subst(char *ep, char *d, char *a, int lev)
{
        char *u, *t;

        if (skipv(skipv(ep-1)) < word)
                return (0);
        for (t = d; *t != '+'; t++)
                continue;
        for (u = ep; *--t != '-'; )
                *--u = *t;
        return (strip(ep, "", d, lev));
}


static int
tion(char *ep, char *d, char *a, int lev)
{
        switch (ep[-2]) {
        case 'c':
        case 'r':
                return (trypref(ep, a, lev));
        case 'a':
                return (y_to_e(ep, d, a, lev));
        }
        return (0);
}

/*      possible consonant-consonant-e ending */
static int
CCe(char *ep, char *d, char *a, int lev)
{
        switch (ep[-1]) {
        case 'r':
                if (ep[-2] == 't')
                        return (y_to_e(ep, d, a, lev));
                break;
        case 'l':
                if (vowel(ep[-2]))
                        break;
                switch (ep[-2]) {
                case 'l':
                case 'r':
                case 'w':
                        break;
                default:
                        return (y_to_e(ep, d, a, lev));
                }
                break;
        case 's':
                if (ep[-2] == 's')
                        break;
                if (*ep == 'a')
                        return (0);
                if (vowel(ep[-2]))
                        break;
                if (y_to_e(ep, d, a, lev))
                        return (1);
                if (!(ep[-2] == 'n' && ep[-1] == 'g'))
                        return (0);
                break;
        case 'c':
        case 'g':
                if (*ep == 'a')
                        return (0);
                if (vowel(ep[-2]))
                        break;
                if (y_to_e(ep, d, a, lev))
                        return (1);
                if (!(ep[-2] == 'n' && ep[-1] == 'g'))
                        return (0);
                break;
        case 'v':
        case 'z':
                if (vowel(ep[-2]))
                        break;
                if (y_to_e(ep, d, a, lev))
                        return (1);
                if (!(ep[-2] == 'n' && ep[-1] == 'g'))
                        return (0);
                break;
        case 'u':
                if (y_to_e(ep, d, a, lev))
                        return (1);
                if (!(ep[-2] == 'n' && ep[-1] == 'g'))
                        return (0);
                break;
        }
        return (VCe(ep, d, a, lev));
}

/*      possible consonant-vowel-consonant-e ending */
static int
VCe(char *ep, char *d, char *a, int lev)
{
        char c;
        c = ep[-1];
        if (c == 'e')
                return (0);
        if (!vowel(c) && vowel(ep[-2])) {
                c = *ep;
                *ep++ = 'e';
                if (trypref(ep, d, lev)||trysuff(ep, lev))
                        return (1);
                ep--;
                *ep = c;
        }
        return (strip(ep, d, a, lev));
}

static char *
lookuppref(char **wp, char *ep)
{
        char **sp;
        char *bp, *cp;

        for (sp = preftab; *sp; sp++) {
                bp = *wp;
                for (cp = *sp; *cp; cp++, bp++)
                        if (Tolower(*bp) != *cp)
                                goto next;
                for (cp = bp; cp < ep; cp++)
                        if (vowel(*cp)) {
                                *wp = bp;
                                return (*sp);
                        }
next:;
        }
        return (0);
}

/*
 *      while word is not in dictionary try stripping
 *      prefixes. Fail if no more prefixes.
 */
static int
trypref(char *ep, char *a, int lev)
{
        char *cp;
        char *bp;
        char *pp;
        int val = 0;
        char space[LINE_MAX * 2];
        deriv[lev] = a;
        if (tryword(word, ep, lev))
                return (1);
        bp = word;
        pp = space;
        deriv[lev+1] = pp;
        while (cp = lookuppref(&bp, ep)) {
                *pp++ = '+';
                while (*pp = *cp++)
                        pp++;
                if (tryword(bp, ep, lev+1)) {
                        val = 1;
                        break;
                }
        }
        deriv[lev+1] = deriv[lev+2] = 0;
        return (val);
}

static int
tryword(char *bp, char *ep, int lev)
{
        int i, j;
        char duple[3];
        if (ep-bp <= 1)
                return (0);
        if (vowel(*ep)) {
                if (monosyl(bp, ep))
                        return (0);
        }
        i = dict(bp, ep);
        if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] && monosyl(bp, ep-1)) {
                ep--;
                deriv[++lev] = duple;
                duple[0] = '+';
                duple[1] = *ep;
                duple[2] = 0;
                i = dict(bp, ep);
        }
        if (vflag == 0 || i == 0)
                return (i);
        /*
         *      when derivations are wanted, collect them
         *      for printing
         */
        j = lev;
        do {
                if (deriv[j])
                        (void) strcat(affix, deriv[j]);
        } while (--j > 0);
        return (i);
}


static int
monosyl(char *bp, char *ep)
{
        if (ep < bp+2)
                return (0);
        if (vowel(*--ep) || !vowel(*--ep) || ep[1] == 'x' || ep[1] == 'w')
                return (0);
        while (--ep >= bp)
                if (vowel(*ep))
                        return (0);
        return (1);
}

static char *
skipv(char *s)
{
        if (s >= word&&vowel(*s))
                s--;
        while (s >= word && !vowel(*s))
                s--;
        return (s);
}

static int
vowel(int c)
{
        switch (Tolower(c)) {
        case 'a':
        case 'e':
        case 'i':
        case 'o':
        case 'u':
        case 'y':
                return (1);
        }
        return (0);
}

static int
dict(char *bp, char *ep)
{
        int temp, result;
        if (xflag)
                (void) fprintf(stdout, "=%.*s\n", ep-bp, bp);
        temp = *ep;
        *ep = 0;
        result = hashlook(bp);
        *ep = temp;
        return (result);
}
Illumos