root/usr/src/cmd/man/makewhatis.c
/*
 * Copyright (c) 2002 John Rochester
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer,
 *    in this position and unchanged.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
 * Copyright 2022 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>

#include <ctype.h>
#include <dirent.h>
#include <err.h>
#include <signal.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "man.h"
#include "stringlist.h"


/*
 * Information collected about each man page in a section.
 * All three strings are heap allocated and are released by
 * free_page_info().
 */
struct page_info {
        char    *filename;      /* "<section dir>/<directory entry name>" */
        char    *name;          /* entry name up to its final '.' */
        char    *suffix;        /* text following the final '.' */
        ino_t   inode;          /* st_ino reported by stat() for filename */
};

/*
 * An expanding string.  The bytes in [content, end) are the current text;
 * the text is not kept NUL-terminated (sbuf_content() stores the
 * terminator on demand).
 */
struct sbuf {
        char    *content;       /* the start of the buffer */
        char    *end;           /* just past the end of the content */
        char    *last;          /* the last allocated character */
};

/*
 * Remove the last amount characters from the sbuf.  No bounds check is
 * made: the caller must know that at least amount characters are present.
 */
#define sbuf_retract(sbuf, amount) ((sbuf)->end -= (amount))
/* Return the length of the sbuf content (a pointer difference) */
#define sbuf_length(sbuf) ((sbuf)->end - (sbuf)->content)

/*
 * Signature of a filtering copy routine used by sbuf_append_edited():
 * it reads length characters starting at from, writes the edited result
 * at to, and returns a pointer just past the last character written.
 * The caller reserves only length bytes at to, so an implementation
 * must not produce more output than input.
 */
typedef char *edited_copy(char *from, char *to, int length);

/*
 * While the whatis line is being formed, it is stored in whatis_proto.
 * When finished, it is reformatted into whatis_final and then appended
 * to whatis_lines.
 */
static struct sbuf      *whatis_proto;
static struct sbuf      *whatis_final;
static stringlist       *whatis_lines;  /* collected output lines */

/*
 * An empty string means there is no temporary file to clean up; the
 * signal handler and the allocation-failure path unlink this path
 * when it is non-empty.
 */
static char tempfile[MAXPATHLEN];       /* path of temporary file, if any */

/*
 * Two-letter mdoc macro names, concatenated.  process_mdoc_line()
 * consults this when deciding whether an argument token is a macro
 * name rather than text.
 */
#define MDOC_COMMANDS   "ArDvErEvFlLiNmPa"


/*
 * Release a struct page_info together with the three strings it owns.
 */
static void
free_page_info(struct page_info *info)
{
        /* The members are independent allocations; order is irrelevant. */
        free(info->suffix);
        free(info->name);
        free(info->filename);

        free(info);
}

/*
 * Allocate and fill in a new struct page_info given the
 * name of the man section directory and the dirent of the file.
 *
 * A candidate page is a regular file whose name ends in '.' followed by
 * zero or more alphanumeric characters.  For such a name the final '.'
 * in dirent->d_name is overwritten with a NUL (the entry is modified in
 * place) so that the name and suffix can be duplicated separately.
 *
 * Returns NULL if the file is not a man page or cannot be stat()ed (a
 * warning is printed in the latter case).  Exits via err() when memory
 * cannot be allocated.  The caller releases the result with
 * free_page_info().
 */
static struct page_info *
new_page_info(char *dir, struct dirent *dirent)
{
        struct page_info *info;
        char            *suffix;
        struct stat     st;

        /*
         * Walk backwards over the alphanumeric suffix.  The explicit
         * start-of-name test before each decrement keeps the pointer
         * inside d_name even for an empty name.
         */
        suffix = &dirent->d_name[strlen(dirent->d_name)];
        do {
                if (suffix == dirent->d_name)
                        return (NULL);
                suffix--;
        } while (suffix != dirent->d_name &&
            isalnum((unsigned char)*suffix));
        if (*suffix != '.')
                return (NULL);

        if ((info = malloc(sizeof (*info))) == NULL)
                err(1, "malloc");
        /* The full path must be built before d_name is split below. */
        if (asprintf(&info->filename, "%s/%s", dir, dirent->d_name) == -1)
                err(1, "asprintf");
        *suffix++ = '\0';
        info->name = strdup(dirent->d_name);
        info->suffix = strdup(suffix);
        if (info->name == NULL || info->suffix == NULL)
                err(1, "strdup");
        if (stat(info->filename, &st) < 0) {
                warn("%s", info->filename);
                free_page_info(info);
                return (NULL);
        }
        if (!S_ISREG(st.st_mode)) {
                free_page_info(info);
                return (NULL);
        }
        info->inode = st.st_ino;
        return (info);
}

/*
 * Discard the sbuf's text; the allocated storage is kept for reuse.
 */
static void
sbuf_clear(struct sbuf *sbuf)
{
        sbuf->end = &sbuf->content[0];
}

/*
 * Create an empty sbuf with LINE_ALLOC bytes of initial storage.
 * Exits via err() if either allocation fails.
 */
static struct sbuf *
new_sbuf(void)
{
        struct sbuf     *fresh = malloc(sizeof (struct sbuf));

        if (fresh == NULL)
                err(1, "malloc");

        fresh->content = malloc(LINE_ALLOC);
        if (fresh->content == NULL)
                err(1, "malloc");

        /* Start out empty, with "last" on the final allocated byte. */
        fresh->end = fresh->content;
        fresh->last = fresh->content + LINE_ALLOC - 1;

        return (fresh);
}

/*
 * Ensure that there is enough room in the sbuf
 * for nchars more characters.
 *
 * The byte at sbuf->last is never handed out, which leaves room for the
 * terminator that sbuf_content() stores.  A request of zero or fewer
 * characters is a no-op.  When the buffer must grow it does so by the
 * smallest multiple of 128 bytes that covers nchars.  If memory cannot
 * be obtained, the temporary output file (if any) is removed and the
 * program exits.
 */
static void
sbuf_need(struct sbuf *sbuf, int nchars)
{
        char    *new_content;
        size_t  size, cntsize, want, avail, grow;

        if (nchars <= 0)
                return;
        want = (size_t)nchars;

        /*
         * Compare sizes rather than computing end + nchars, which could
         * point beyond the allocation.
         */
        avail = (size_t)(sbuf->last - sbuf->end);
        if (want <= avail)
                return;

        /* we grow in chunks of 128 bytes */
        grow = ((want + 127) / 128) * 128;

        size = (size_t)(sbuf->last + 1 - sbuf->content) + grow;
        cntsize = (size_t)(sbuf->end - sbuf->content);

        if ((new_content = realloc(sbuf->content, size)) == NULL) {
                perror("realloc");
                if (tempfile[0] != '\0')
                        (void) unlink(tempfile);
                exit(1);
        }
        sbuf->content = new_content;
        sbuf->end = new_content + cntsize;
        sbuf->last = new_content + size - 1;
}

/*
 * Add length bytes taken from text to the end of the sbuf.
 * A length of zero or less adds nothing.
 */
static void
sbuf_append(struct sbuf *sbuf, const char *text, int length)
{
        if (length <= 0)
                return;

        sbuf_need(sbuf, length);
        (void) memcpy(sbuf->end, text, length);
        sbuf->end = sbuf->end + length;
}

/*
 * Add a NUL-terminated string (without its terminator) to the sbuf.
 */
static void
sbuf_append_str(struct sbuf *sbuf, char *text)
{
        int     text_length = strlen(text);

        sbuf_append(sbuf, text, text_length);
}

/*
 * Add a NUL-terminated string to the sbuf after passing it through the
 * given copy routine.  Room is reserved for strlen(text) bytes and the
 * routine's return value becomes the new end of the content.
 */
static void
sbuf_append_edited(struct sbuf *sbuf, char *text, edited_copy copy)
{
        int     text_length = strlen(text);

        if (text_length <= 0)
                return;

        sbuf_need(sbuf, text_length);
        sbuf->end = copy(text, sbuf->end, text_length);
}

/*
 * Drop trailing characters from the sbuf for as long as the final
 * character is a member of set.
 */
static void
sbuf_strip(struct sbuf *sbuf, const char *set)
{
        char    *tail = sbuf->end;

        for (; tail > sbuf->content; tail--) {
                if (strchr(set, tail[-1]) == NULL)
                        break;
        }
        sbuf->end = tail;
}

/*
 * Terminate the sbuf's text with a NUL and return it as a C string.
 * The returned pointer is the sbuf's own storage, not a copy.
 */
static char *
sbuf_content(struct sbuf *sbuf)
{
        sbuf->end[0] = '\0';

        return (sbuf->content);
}

/*
 * Report whether the directory lacks a page for every name in the list.
 * Each name is tried as "<dir>/<name>.<suffix>" on its own and with
 * ".gz" and ".bz2" appended.  Returns 0 as soon as one such file is
 * found and 1 when none is.
 */
static int
no_page_exists(char *dir, stringlist *names, char *suffix)
{
        static const char *const compressions[] = { "", ".gz", ".bz2" };
        const size_t    ncompressions =
            sizeof (compressions) / sizeof (compressions[0]);
        char            candidate[MAXPATHLEN];
        size_t          n, c;

        for (n = 0; n < names->sl_cur; n++) {
                for (c = 0; c < ncompressions; c++) {
                        (void) snprintf(candidate, sizeof (candidate),
                            "%s/%s.%s%s", dir, names->sl_str[n], suffix,
                            compressions[c]);
                        if (access(candidate, F_OK) == 0)
                                return (0);
                }
        }

        return (1);
}

/*
 * Signal handler: remove the temporary output file, if one is in
 * progress, and terminate with status 1.
 *
 * Only async-signal-safe calls are made here.  _exit() is used rather
 * than exit() because exit() runs atexit handlers and flushes stdio
 * streams, neither of which is safe from a signal handler.
 */
/* ARGSUSED sig */
static void
trap_signal(int sig)
{

        if (tempfile[0] != '\0')
                (void) unlink(tempfile);

        _exit(1);
}

/*
 * Attempt to open an output file.
 *
 * Output is written to "<name>.tmp", whose path is recorded in tempfile
 * so that it can be removed on failure or interruption.  On success the
 * list of collected output lines (whatis_lines) is started afresh.
 *
 * Return NULL if unsuccessful; in that case a warning has been printed,
 * tempfile is left empty and no line list is allocated.
 */
static FILE *
open_output(char *name)
{
        FILE    *output;
        int     len;

        /*
         * Refuse a truncated temporary name: it could alias the real
         * output file, which would then be clobbered and unlinked.
         */
        len = snprintf(tempfile, sizeof (tempfile), "%s.tmp", name);
        if (len < 0 || (size_t)len >= sizeof (tempfile)) {
                warnx("%s.tmp: path too long", name);
                tempfile[0] = '\0';
                return (NULL);
        }
        if ((output = fopen(tempfile, "w")) == NULL) {
                warn("%s", tempfile);
                /* Nothing was created, so leave nothing to unlink. */
                tempfile[0] = '\0';
                return (NULL);
        }
        /* Allocate the line list only once the file is known to be open. */
        whatis_lines = sl_init();
        return (output);
}

/*
 * qsort() comparator for an array of C strings: each argument points at
 * a char pointer, and the strings are ordered with strcmp().
 */
static int
linesort(const void *a, const void *b)
{
        const char      *lhs = *(const char * const *)a;
        const char      *rhs = *(const char * const *)b;

        return (strcmp(lhs, rhs));
}

/*
 * Write the unique sorted lines to the output file, then move the
 * temporary file into place as name.
 *
 * The collected lines are freed in all cases.  If writing or closing
 * the temporary file fails, or the rename fails, a warning is printed
 * and the temporary file is removed, so that a previously existing
 * output file is never replaced by an incomplete one.  tempfile is
 * emptied on return.
 */
static void
finish_output(FILE *output, char *name)
{
        size_t  i;
        char    *prev = NULL;
        int     write_failed;

        qsort(whatis_lines->sl_str, whatis_lines->sl_cur, sizeof (char *),
            linesort);
        for (i = 0; i < whatis_lines->sl_cur; i++) {
                char *line = whatis_lines->sl_str[i];
                /* Equal lines are adjacent after sorting; emit one only. */
                if (i > 0 && strcmp(line, prev) == 0)
                        continue;
                prev = line;
                (void) fputs(line, output);
                (void) putc('\n', output);
        }

        /* Buffered data may only fail at close time, so check both. */
        write_failed = (ferror(output) != 0);
        if (fclose(output) != 0)
                write_failed = 1;

        if (write_failed) {
                warn("%s", tempfile);
                (void) unlink(tempfile);
        } else if (rename(tempfile, name) != 0) {
                warn("%s", name);
                (void) unlink(tempfile);
        }

        sl_free(whatis_lines, 1);
        whatis_lines = NULL;
        /* The temporary file is gone either way. */
        tempfile[0] = '\0';
}

/*
 * Open the output for "<mandir>/<WHATIS>" via open_output().
 * Returns whatever open_output() returns.
 */
static FILE *
open_whatis(char *mandir)
{
        char    whatis_path[MAXPATHLEN];

        (void) snprintf(whatis_path, sizeof (whatis_path), "%s/%s",
            mandir, WHATIS);

        return (open_output(whatis_path));
}

/*
 * Complete the output for "<mandir>/<WHATIS>" via finish_output().
 */
static void
finish_whatis(FILE *output, char *mandir)
{
        char    whatis_path[MAXPATHLEN];

        (void) snprintf(whatis_path, sizeof (whatis_path), "%s/%s",
            mandir, WHATIS);

        finish_output(output, whatis_path);
}

/*
 * Remove trailing white space from a string, returning a pointer to just
 * beyond the new last character (that is, to the new terminator).
 *
 * The first character of the string is never removed, even when it is
 * white space, so a string consisting only of white space is reduced to
 * its first character.  Callers in this file rely on that, so it is
 * kept.  An empty string is returned unchanged.
 */
static char *
trim_rhs(char *str)
{
        size_t  len = strlen(str);

        /*
         * Work with an index so that no pointer before str is formed,
         * and cast for <ctype.h>, which requires an unsigned char value.
         */
        while (len > 1 && isspace((unsigned char)str[len - 1]))
                len--;
        str[len] = '\0';
        return (&str[len]);
}

/*
 * Return a pointer to the next non-space character in the string, or to
 * its terminator if only white space remains.
 */
static char *
skip_spaces(char *s)
{

        /*
         * <ctype.h> functions need an unsigned char value; man pages may
         * contain bytes that are negative as plain char.  The NUL
         * terminator is not white space, so it stops the loop too.
         */
        while (isspace((unsigned char)*s))
                s++;

        return (s);
}

/*
 * Decide whether a line opens the NAME section, i.e. has one of the
 * forms
 *      .Sh NAME
 *      .Sh "NAME"
 * where ".Sh" stands for the three characters of section_start.
 * Returns 1 if so and 0 otherwise.  The line is modified in place:
 * trailing white space is trimmed and a closing quote is removed.
 */
static int
name_section_line(char *line, const char *section_start)
{
        char    *title;
        char    *tail;

        if (strncmp(line, section_start, 3) != 0)
                return (0);

        title = skip_spaces(&line[3]);
        tail = trim_rhs(title);

        /* Strip the quotes from a quoted section title. */
        if (title[0] == '"') {
                title++;
                tail--;
                if (*tail == '"')
                        *tail = '\0';
        }

        return (strcmp(title, "NAME") == 0 ? 1 : 0);
}

/*
 * Copy characters while removing the most common nroff/troff markup:
 *      \(em, \(mi, \s[+-]N, \&
 *      \fF, \f(fo, \f[font]
 *      \*s, \*(st, \*[stringvar]
 * For any other escape the backslash is dropped and the character after
 * it is copied (so "\-" becomes "-").
 *
 * from/fromlen describe the input; to receives the output, which is
 * never longer than the input.  Returns a pointer just past the last
 * character written to to.  The output is not NUL-terminated.
 *
 * Reading never goes beyond from + fromlen: an escape cut short by the
 * end of the input is discarded, and a lone trailing backslash is
 * dropped rather than copying the terminator into the output.
 */
static char *
de_nroff_copy(char *from, char *to, int fromlen)
{
        char    *from_end = &from[fromlen];

        while (from < from_end) {
                if (*from != '\\') {
                        *to++ = *from++;
                        continue;
                }
                /* A backslash ending the text has nothing to escape. */
                if (++from >= from_end)
                        break;
                switch (*from) {
                case '(':
                        if (from_end - from >= 3 &&
                            (strncmp(&from[1], "em", 2) == 0 ||
                            strncmp(&from[1], "mi", 2) == 0)) {
                                from += 3;
                                continue;
                        }
                        /* Other two-letter names: only the '\' is lost. */
                        break;
                case 's':
                        from++;
                        if (from < from_end && (*from == '-' || *from == '+'))
                                from++;
                        while (from < from_end &&
                            isdigit((unsigned char)*from))
                                from++;
                        continue;
                case 'f':
                case '*':
                        from++;
                        if (from < from_end && *from == '(') {
                                /* '(' plus a two-character name */
                                from = (from_end - from < 3) ?
                                    from_end : from + 3;
                        } else if (from < from_end && *from == '[') {
                                while (++from < from_end && *from != ']')
                                        ;
                                if (from < from_end)
                                        from++;
                        } else if (from < from_end) {
                                /* single-character name */
                                from++;
                        }
                        continue;
                case '&':
                        from++;
                        continue;
                }
                *to++ = *from++;
        }
        return (to);
}

/*
 * Append text to whatis_proto, filtering it through de_nroff_copy()
 * on the way.
 */
static void
add_nroff(char *text)
{
        struct sbuf     *dest = whatis_proto;

        sbuf_append_edited(dest, text, de_nroff_copy);
}

/*
 * Append "name(suffix), " to whatis_final.  An empty name adds nothing.
 */
static void
add_whatis_name(char *name, char *suffix)
{
        if (name[0] == '\0')
                return;

        sbuf_append_str(whatis_final, name);
        sbuf_append(whatis_final, "(", 1);
        sbuf_append_str(whatis_final, suffix);
        sbuf_append(whatis_final, "), ", 3);
}

/*
 * Processes an old-style man(7) line, appending its text (with nroff
 * markup removed) and a trailing space to whatis_proto.
 *
 * For a line starting with '.', the letters of the request name are
 * skipped and only the arguments are kept.  Such a line is ignored
 * entirely when its arguments are empty or consist only of digits
 * (for example a command with a single number argument).
 */
static void
process_man_line(char *line)
{
        char    *p;

        if (*line == '.') {
                /* Casts: <ctype.h> requires an unsigned char value. */
                while (isalpha((unsigned char)*++line))
                        ;
                p = line = skip_spaces(line);
                while (isdigit((unsigned char)*p))
                        p++;
                if (*p == '\0')
                        return;
        } else {
                line = skip_spaces(line);
        }
        if (*line != '\0') {
                add_nroff(line);
                sbuf_append(whatis_proto, " ", 1);
        }
}

/*
 * Processes a new-style mdoc(7) line, appending its text to whatis_proto.
 *
 * A line that does not start with '.' plus an upper- and a lower-case
 * letter is plain text: it is appended (nroff markup removed) followed
 * by a space.  Otherwise the macro name is dropped and the arguments are
 * handled one at a time:
 *  - a double-quoted argument has its quotes removed ("" inside stands
 *    for one literal quote);
 *  - "Ns" is dropped and suppresses the space before the next argument;
 *  - the macro names listed in MDOC_COMMANDS are dropped;
 *  - for ".Xr", the second argument is appended as "(section)";
 *  - other arguments are appended, separated by a single space unless
 *    the argument begins with closing punctuation.
 * A trailing space is appended if anything was added.  The line is
 * modified in place.
 */
static void
process_mdoc_line(char *line)
{
        int     xref;
        int     arg = 0;
        char    *line_end = &line[strlen(line)];
        int     orig_length = sbuf_length(whatis_proto);
        char    *next;

        if (*line == '\0')
                return;
        if (line[0] != '.' || !isupper((unsigned char)line[1]) ||
            !islower((unsigned char)line[2])) {
                add_nroff(skip_spaces(line));
                sbuf_append(whatis_proto, " ", 1);
                return;
        }
        xref = strncmp(line, ".Xr", 3) == 0;
        line += 3;
        while ((line = skip_spaces(line)) < line_end) {
                if (*line == '"') {
                        next = ++line;
                        for (;;) {
                                next = strchr(next, '"');
                                if (next == NULL)
                                        break;
                                /* Delete the quote (moves the NUL too). */
                                (void) memmove(next, next + 1, strlen(next));
                                line_end--;
                                /* A doubled quote is a literal quote. */
                                if (*next != '"')
                                        break;
                                next++;
                        }
                } else {
                        next = strpbrk(line, " \t");
                }
                if (next != NULL)
                        *next++ = '\0';
                else
                        next = line_end;
                if (isupper((unsigned char)*line) &&
                    islower((unsigned char)line[1]) && line[2] == '\0') {
                        if (strcmp(line, "Ns") == 0) {
                                arg = 0;
                                line = next;
                                continue;
                        }
                        /*
                         * Look the two-letter token up in the list of
                         * macro names (not the list in the token).
                         */
                        if (strstr(MDOC_COMMANDS, line) != NULL) {
                                line = next;
                                continue;
                        }
                }
                if (arg > 0 && strchr(",.:;?!)]", *line) == NULL) {
                        if (xref) {
                                /* The section is emitted once, as "(n)". */
                                sbuf_append(whatis_proto, "(", 1);
                                add_nroff(line);
                                sbuf_append(whatis_proto, ")", 1);
                                xref = 0;
                                arg++;
                                line = next;
                                continue;
                        }
                        sbuf_append(whatis_proto, " ", 1);
                }
                add_nroff(line);
                arg++;
                line = next;
        }
        if (sbuf_length(whatis_proto) > orig_length)
                sbuf_append(whatis_proto, " ", 1);
}

/*
 * Split text at its commas and add each piece to the stringlist.
 * The text is cut up in place (commas become NULs) and the list
 * entries point into it.  A single space following a comma is skipped.
 */
static void
collect_names(stringlist *names, char *text)
{
        char    *comma;

        while ((comma = strchr(text, ',')) != NULL) {
                *comma = '\0';
                (void) sl_add(names, text);
                text = comma + 1;
                if (*text == ' ')
                        text++;
        }
        /* Whatever follows the last comma is the final name. */
        (void) sl_add(names, text);
}

/*
 * Parser states used by process_page() while reading a page:
 *      STATE_UNKNOWN   the NAME section has not been reached yet
 *      STATE_MANSTYLE  inside an old-style ".SH NAME" section
 *      STATE_MDOCNAME  inside a new-style ".Sh NAME" section (.Nm part)
 *      STATE_MDOCDESC  inside a new-style ".Sh NAME" section, after .Nm
 */
enum { STATE_UNKNOWN, STATE_MANSTYLE, STATE_MDOCNAME, STATE_MDOCDESC };

/*
 * Process a man page source into a single whatis line and add it
 * to whatis_lines.
 *
 * The NAME section of the page is gathered into whatis_proto, split into
 * a comma-separated list of names and a description, and reformatted in
 * whatis_final as "name(suffix), ... - description".  If the page's own
 * name is not among the listed names and no file exists for any listed
 * name, the page name is added first.
 *
 * Nothing is added when the file cannot be opened (a warning is printed),
 * when no name/description split can be found, or when no name at all
 * results.  If the final line cannot be duplicated, the temporary output
 * file is removed and the program exits.
 */
static void
process_page(struct page_info *page, char *section_dir)
{
        FILE            *fp;
        stringlist      *names;
        char            *descr;
        char            *proto;
        char            *entry;
        int             state = STATE_UNKNOWN;
        size_t          i;
        char            *line = NULL;
        size_t          linecap = 0;

        sbuf_clear(whatis_proto);
        if ((fp = fopen(page->filename, "r")) == NULL) {
                warn("%s", page->filename);
                return;
        }
        while (getline(&line, &linecap, fp) > 0) {
                /* Skip comments */
                if (strncmp(line, ".\\\"", 3) == 0)
                        continue;
                switch (state) {
                /* Haven't reached the NAME section yet */
                case STATE_UNKNOWN:
                        if (name_section_line(line, ".SH"))
                                state = STATE_MANSTYLE;
                        else if (name_section_line(line, ".Sh"))
                                state = STATE_MDOCNAME;
                        continue;
                /* Inside an old-style .SH NAME section */
                case STATE_MANSTYLE: {
                        char *altline;

                        if (strncmp(line, ".SH", 3) == 0 ||
                            strncmp(line, ".SS", 3) == 0)
                                break;
                        (void) trim_rhs(line);
                        if (strcmp(line, ".") == 0)
                                continue;
                        altline = line;
                        if (strncmp(altline, ".IX", 3) == 0) {
                                altline += 3;
                                altline = skip_spaces(altline);
                        }
                        process_man_line(altline);
                        continue;
                }
                /* Inside a new-style .Sh NAME section (the .Nm part) */
                case STATE_MDOCNAME:
                        (void) trim_rhs(line);
                        if (strncmp(line, ".Nm", 3) == 0) {
                                process_mdoc_line(line);
                                continue;
                        } else {
                                if (strcmp(line, ".") == 0)
                                        continue;
                                sbuf_append(whatis_proto, "- ", 2);
                                state = STATE_MDOCDESC;
                        }
                        /* FALLTHROUGH */
                /* Inside a new-style .Sh NAME section (after the .Nm-s) */
                case STATE_MDOCDESC:
                        if (strncmp(line, ".Sh", 3) == 0)
                                break;
                        (void) trim_rhs(line);
                        if (strcmp(line, ".") == 0)
                                continue;
                        process_mdoc_line(line);
                        continue;
                }
                /* Reached only when the NAME section has ended. */
                break;
        }
        (void) fclose(fp);
        /* The getline() buffer is ours to release; it is not used again. */
        free(line);

        sbuf_strip(whatis_proto, " \t.-");
        proto = sbuf_content(whatis_proto);
        /*
         * proto now contains the appropriate data, but without the
         * proper indentation or the section appended to each name.
         */
        descr = strstr(proto, " - ");
        if (descr == NULL) {
                descr = strchr(proto, ' ');
                if (descr == NULL)
                        return;
                *descr++ = '\0';
        } else {
                *descr = '\0';
                descr += 3;
        }
        names = sl_init();
        collect_names(names, proto);
        sbuf_clear(whatis_final);
        if (!sl_find(names, page->name) &&
            no_page_exists(section_dir, names, page->suffix)) {
                /*
                 * Add the page name since that's the only
                 * thing that man(1) will find.
                 */
                add_whatis_name(page->name, page->suffix);
        }
        for (i = 0; i < names->sl_cur; i++)
                add_whatis_name(names->sl_str[i], page->suffix);
        sl_free(names, 0);
        /*
         * Every name may have been empty, in which case there is no
         * ", " to remove and no sensible line to emit.
         */
        if (sbuf_length(whatis_final) == 0)
                return;
        /* Remove last ", " */
        sbuf_retract(whatis_final, 2);
        while (sbuf_length(whatis_final) < INDENT)
                sbuf_append(whatis_final, " ", 1);
        sbuf_append(whatis_final, " - ", 3);
        sbuf_append_str(whatis_final, skip_spaces(descr));
        if ((entry = strdup(sbuf_content(whatis_final))) == NULL) {
                warn("strdup");
                if (tempfile[0] != '\0')
                        (void) unlink(tempfile);
                exit(1);
        }
        (void) sl_add(whatis_lines, entry);
}

/*
 * Sort pages first by inode number, then by name.
 *
 * qsort() comparator for an array of struct page_info pointers.  The
 * inode numbers are compared rather than subtracted: ino_t may be wider
 * than int (and unsigned), so a truncated difference could have the
 * wrong sign or be zero for distinct inodes.
 */
static int
pagesort(const void *a, const void *b)
{
        const struct page_info *p1 = *(const struct page_info * const *)a;
        const struct page_info *p2 = *(const struct page_info * const *)b;

        if (p1->inode != p2->inode)
                return (p1->inode < p2->inode ? -1 : 1);

        return (strcmp(p1->name, p2->name));
}

/*
 * Process a single man section.
 */
static void
process_section(char *section_dir)
{
        struct dirent   **entries;
        int             nentries;
        struct page_info **pages;
        int             npages = 0;
        int             i;
        ino_t           prev_inode = 0;

        /* Scan the man section directory for pages */
        nentries = scandir(section_dir, &entries, NULL, alphasort);

        /* Collect information about man pages */
        pages = (struct page_info **)calloc(nentries,
            sizeof (struct page_info *));
        for (i = 0; i < nentries; i++) {
                struct page_info *info = new_page_info(section_dir, entries[i]);
                if (info != NULL)
                        pages[npages++] = info;
                free(entries[i]);
        }
        free(entries);
        qsort(pages, npages, sizeof (struct page_info *), pagesort);

        /* Process each unique page */
        for (i = 0; i < npages; i++) {
                struct page_info *page = pages[i];
                if (page->inode != prev_inode) {
                        prev_inode = page->inode;
                        process_page(page, section_dir);
                }
                free_page_info(page);
        }
        free(pages);
}

/*
 * Return whether the directory entry is a man page section: its name
 * must be "man" followed by zero or more alphanumeric characters.
 * Used as the selection callback for scandir().
 */
static int
select_sections(const struct dirent *entry)
{
        const char      *p;

        if (strncmp(entry->d_name, "man", 3) != 0)
                return (0);
        /*
         * The name is known to hold at least three characters here.
         * The cast is needed because <ctype.h> requires an unsigned
         * char value.
         */
        for (p = &entry->d_name[3]; *p != '\0'; p++) {
                if (!isalnum((unsigned char)*p))
                        return (0);
        }
        return (1);
}

/*
 * Process a single top-level man directory by finding all the
 * sub-directories named man* and processing each one in turn.
 *
 * The result is written to the whatis file inside path (by way of a
 * temporary file).  Handlers are installed for SIGINT, SIGHUP, SIGQUIT
 * and SIGTERM so that the temporary file is removed on interruption.
 * If path cannot be scanned, or the output cannot be opened, a warning
 * is printed and nothing is written.
 *
 * The two work buffers are created on the first call and reused by
 * later calls.
 */
void
mwpath(char *path)
{
        FILE            *fp;
        struct dirent   **entries = NULL;
        int             nsections;
        int             i;

        (void) signal(SIGINT, trap_signal);
        (void) signal(SIGHUP, trap_signal);
        (void) signal(SIGQUIT, trap_signal);
        (void) signal(SIGTERM, trap_signal);

        /* Each page clears these before use, so one pair suffices. */
        if (whatis_proto == NULL)
                whatis_proto = new_sbuf();
        if (whatis_final == NULL)
                whatis_final = new_sbuf();

        nsections = scandir(path, &entries, select_sections, alphasort);
        if (nsections < 0) {
                /* entries is not valid after a failed scan */
                warn("%s", path);
                return;
        }

        fp = open_whatis(path);
        if (fp != NULL) {
                for (i = 0; i < nsections; i++) {
                        char    section_dir[MAXPATHLEN];

                        (void) snprintf(section_dir, MAXPATHLEN, "%s/%s",
                            path, entries[i]->d_name);
                        process_section(section_dir);
                }
        }

        /* The scan results are released whether or not output opened. */
        for (i = 0; i < nsections; i++)
                free(entries[i]);
        free(entries);

        if (fp != NULL)
                finish_whatis(fp, path);
}