root/usr/src/cmd/sort/fields.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include "fields.h"

/*
 * fields
 *
 * Overview
 *   By a field, we mean the various delimited character sequences within each
 *   line of the input files.  The sort key consists of an ordered sequence of
 *   fields, which need not include all possible fields for the given line.
 *   (Furthermore, not every line need contain sufficient fields for the fields
 *   given within the sort key.  In fact, none of the lines in the input stream
 *   need contain sufficient fields.)
 *
 *   There are two methods for specifying fields for sort(1); these are
 *   discussed in options.c.  Here we discuss only the internal representation
 *   of fields, as used for constructing the collation vector for each line as
 *   defined by the sort key.
 *
 * Representation
 *   The sort key is a singly-linked list of field specifiers.  At present,
 *   fields may belong to one of three species:  alphabetical, numerical, or
 *   monthly; the species (f_species) then indicates the conversion function
 *   (f_convert) used to transform the raw characters of the character sequence
 *   to a collatable form.  (In principle, this allows us to consider future
 *   field species such as hexadecimal.)
 *
 *   Fields and offsets are numbered such that zero refers to the first field or
 *   character, respectively.  Thus, the interpretation of a key specifier, m.n,
 *   is that the field begins at the nth character beyond the mth occurence of
 *   the key separator.  If the blanks flag has been specified, then the field
 *   begins at the nth non-blank character past the mth key separator.  If the
 *   key separator is unspecified, then the key separator is defined as one or
 *   more blank characters.
 *
 *   In general, the various options afforded by sort may be broken into two
 *   categories:  field species and field modifiers.  For each field species,
 *   there is one or more conversion routines that take a delimited character
 *   sequence and convert it to a character sequence collatable by strcmp() or
 *   memcmp().  For field species that may be further modified, such as the
 *   fold-to-uppercase option for alphabetic fields, the conversion routine may
 *   be aware of how the modifier affects collation.  Finally, the no-modifiers
 *   case may present an opportunity for a simplified, faster version.
 *
 * Code Structure
 *   The code paths for single-byte and multi-byte locales diverge significantly
 *   in fields.c.  Most routines have an *_wide() version, which produces an
 *   equivalent effect for line records whose data field is composed of wide
 *   characters (wchar_t).  However, the l_collated field of a line record is
 *   always composed of characters, so that the radix sorts provided in
 *   internal.c can work in both single- and multi-byte locales.  Thus, in the
 *   various convert_*_wide() routines, the output is placed in l_collated, with
 *   a length multiplier of 4.
 */

#define BEFORE_NUMBER   0x0
#define IN_NUMBER       0x1

static char     numerical_separator;
static char     numerical_decimal;
static char     monetary_separator;
static char     monetary_decimal;

static wchar_t  w_numerical_separator;
static wchar_t  w_numerical_decimal;
static wchar_t  w_monetary_separator;
static wchar_t  w_monetary_decimal;

#define MONTHS_IN_YEAR  12
#define MAX_MON_LEN     20

enum { MO_NONE = 1, MO_OFFSET = 2 };

static char     *months[MONTHS_IN_YEAR];
static size_t   month_lengths[MONTHS_IN_YEAR];
static wchar_t  *w_months[MONTHS_IN_YEAR];
static size_t   w_month_lengths[MONTHS_IN_YEAR];

#define DECIMAL_CHAR            (numerical_decimal)
#define IS_BLANK(x)             (isspace((uchar_t)(x)) && (x) != '\n')
#define IS_SEPARATOR(x)         \
        ((numerical_separator != '\0' && (x) == numerical_separator) || \
        (monetary_separator != '\0' && (x) == monetary_separator))
#define IS_DECIMAL(x)           \
        ((x) == numerical_decimal || \
        (monetary_decimal != '\0' && (x) == monetary_decimal))
#define W_DECIMAL_CHAR          (w_numerical_decimal)
#define W_IS_BLANK(x)           (iswspace(x) && (x) != L'\n')
#define W_IS_SEPARATOR(x)       \
        ((numerical_separator != '\0' && (x) == w_numerical_separator) || \
        (monetary_separator != '\0' && (x) == w_monetary_separator))
#define W_IS_DECIMAL(x)         \
        (((x) == w_numerical_decimal) || \
        (monetary_decimal != '\0' && (x) == w_monetary_decimal))

#define INTERFIELD_SEPARATOR '\0'
#define W_INTERFIELD_SEPARATOR L'\0'

#define INT_SIGN_FLIP_MASK 0x80000000
#define INT_SIGN_PASS_MASK 0x00000000

/*
 * strx_ops_t, xfrm_len, and xfrm_cpy:  In the case where we are sorting in the
 * C locale, we want to avoid the expense of transforming strings to collatable
 * forms since, by definition, an arbitrary string in the C locale is already in
 * its collatable form.  Therefore, we construct a small ops vector (the
 * strx_ops) and two wrappers: xfrm_len() to massage the strxfrm(NULL, ...) into
 * strlen()-like behaviour, and xfrm_cpy() to make strncpy() appear
 * strxfrm()-like.
 */
/*ARGSUSED*/
static size_t
xfrm_len(const char *s2, size_t len)
{
        return (strxfrm(NULL, s2, 0) + 1);
}

/*
 * The length represented by n includes a null character, so to return the
 * correct length we subtract 1.  Note that this function is only used by
 * field_convert_alpha, and isn't for general use, as it assumes that n is the
 * length of s2 plus a null character.
 */
static size_t
C_ncpy(char *s1, const char *s2, size_t n)
{
        (void) strncpy(s1, s2, n);
        return (n - 1);
}

/*ARGSUSED*/
static size_t
C_len(const char *s, size_t len)
{
        ASSERT(s != NULL);
        return (len);
}

typedef struct _strx_ops {
        size_t  (*sx_len)(const char *, size_t);
        size_t  (*sx_xfrm)(char *, const char *, size_t);
} strx_ops_t;

static const strx_ops_t C_ops = { C_len, C_ncpy };
static const strx_ops_t SB_ops = { xfrm_len, strxfrm };

static const strx_ops_t *xfrm_ops;

static void
field_initialize_separator(void)
{
        /*
         * A locale need not define all of the cases below:  only decimal_point
         * must be defined.  Furthermore, sort(1) has traditionally not used the
         * positive_sign and negative_sign, grouping, or currency_symbols (or
         * their numeric counterparts, if any).
         */
        struct lconv *conv = localeconv();

        if (!xstreql(conv->thousands_sep, "")) {
                numerical_separator = *conv->thousands_sep;
                (void) mbtowc(&w_numerical_separator, conv->thousands_sep,
                    MB_CUR_MAX);
        } else
                numerical_separator = '\0';

        if (!xstreql(conv->mon_thousands_sep, "")) {
                monetary_separator = *conv->mon_thousands_sep;
                (void) mbtowc(&w_monetary_separator, conv->mon_thousands_sep,
                    MB_CUR_MAX);
        } else
                monetary_separator = '\0';

        if (!xstreql(conv->mon_decimal_point, "")) {
                monetary_decimal = *conv->mon_decimal_point;
                (void) mbtowc(&w_monetary_decimal, conv->mon_decimal_point,
                    MB_CUR_MAX);
        } else
                monetary_decimal = '\0';

        numerical_decimal = *conv->decimal_point;
        (void) mbtowc(&w_numerical_decimal, conv->decimal_point, MB_CUR_MAX);
}

static void
field_initialize_month(int is_c_locale)
{
        int i;
        int j;
        struct tm this_month;
        const char *c_months[MONTHS_IN_YEAR] = {
                "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
        };

        char month_name[MAX_MON_LEN * MB_LEN_MAX];
        wchar_t w_month_name[MAX_MON_LEN];

        if (is_c_locale) {
                for (i = 0; i < MONTHS_IN_YEAR; i++) {
                        months[i] = (char *)c_months[i];
                        month_lengths[i] = strlen(c_months[i]);
                }
                /*
                 * We don't need to initialize the wide version of the month
                 * names.
                 */
                return;
        }

        (void) memset(&this_month, 0, sizeof (this_month));

        for (i = 0; i < MONTHS_IN_YEAR; i++) {
                this_month.tm_mon = i;

                (void) strftime(month_name, sizeof (month_name),
                    "%b", &this_month);

                for (j = 0; j < strlen(month_name); j++)
                        month_name[j] = toupper(month_name[j]);
                (void) mbstowcs(w_month_name, month_name, MAX_MON_LEN);

                months[i] = strdup(month_name);
                month_lengths[i] = strlen(month_name);
                w_months[i] = wsdup(w_month_name);
                w_month_lengths[i] = wslen(w_month_name);
        }
}

void
field_initialize(sort_t *S)
{
        field_initialize_month(S->m_c_locale);
        field_initialize_separator();

        if (S->m_c_locale)
                xfrm_ops = &C_ops;
        else
                xfrm_ops = &SB_ops;
}

field_t *
field_new(sort_t *S)
{
        field_t *F = safe_realloc(NULL, sizeof (field_t));

        F->f_start_field = -1;
        F->f_start_offset = -1;
        F->f_end_field = -1;
        F->f_end_offset = -1;
        F->f_next = NULL;

        if (S == NULL) {
                F->f_species = ALPHA;
                F->f_options = 0;
        } else {
                F->f_species = S->m_default_species;
                F->f_options = S->m_field_options;
        }

        return (F);
}

void
field_delete(field_t *F)
{
        free(F);
}

/*
 * The recursive implementation of field_add_to_chain() given below is
 * inappropriate if function calls are expensive, or a truly large number of
 * fields are anticipated.
 */
void
field_add_to_chain(field_t **F, field_t *A)
{
        if (*F == NULL)
                *F = A;
        else
                field_add_to_chain(&((*F)->f_next), A);
}

#ifdef DEBUG
#ifndef _LP64
#define FIELD_FMT \
"\nStart field: %d\tStart offset: %d\nEnd field: %d\tEnd offset: %d\n"
#else /* !_LP64 */
#define FIELD_FMT \
"\nStart field: %ld\tStart offset: %ld\nEnd field: %ld\tEnd offset: %ld\n"
#endif /* !_LP64 */

/*
 * field_print is used only for debugging purposes.
 */
void
field_print(field_t *F)
{
        char *field_names[] = {"ALPHA", "MONTH", "NUMERIC"};
        int status = 0;

        (void) fprintf(stderr, "Type: %s", field_names[F->f_species]);
        (void) fprintf(stderr, "\tOptions: ");

        if (F->f_options & FIELD_REVERSE_COMPARISONS) {
                (void) fprintf(stderr, "REVERSE");
                status++;
        }
        if (F->f_options & FIELD_DICTIONARY_ORDER) {
                (void) fprintf(stderr, "DICTIONARY ");
                status++;
        }
        if (F->f_options & FIELD_FOLD_UPPERCASE) {
                (void) fprintf(stderr, "UPPERCASE ");
                status++;
        }
        if (F->f_options & FIELD_IGNORE_NONPRINTABLES) {
                (void) fprintf(stderr, "PRINTABLES ");
                status++;
        }
        if (F->f_options & FIELD_IGNORE_BLANKS_START) {
                (void) fprintf(stderr, "BLANKS_START ");
                status++;
        }
        if (F->f_options & FIELD_IGNORE_BLANKS_END) {
                (void) fprintf(stderr, "BLANKS_END ");
                status++;
        }

        if (status == 0)
                (void) fprintf(stderr, "NO_MODIFIERS");

        (void) fprintf(stderr, FIELD_FMT, F->f_start_field, F->f_start_offset,
            F->f_end_field, F->f_end_offset);
}
#endif /* DEBUG */

static ssize_t
field_boundary(field_t *F, line_rec_t *L, int is_end, int is_blanks)
{
        char *S = L->l_data.sp;
        char *T = S;
        char *eol = S + L->l_data_length;
        ssize_t field = is_end ? F->f_end_field : F->f_start_field;
        ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
        ssize_t ret;

        ASSERT(is_end || field > -1);

        if (is_end && field == -1)
                return (L->l_data_length);

        while (field-- > 0) {
                while (T < eol && IS_BLANK(*T))
                        T++;

                while (T < eol && !IS_BLANK(*T))
                        T++;
        }

        if ((!is_end || offset > 0) && is_blanks) {
                while (IS_BLANK(*T))
                        T++;
        }

        if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
                return (L->l_data_length);

        return (ret);
}

static void
field_delimit(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
{
        ASSERT(F->f_start_field > -1);

        *start = field_boundary(F, L, 0,
            F->f_options & FIELD_IGNORE_BLANKS_START);
        *end = field_boundary(F, L, 1,
            F->f_options & FIELD_IGNORE_BLANKS_END);
}

static ssize_t
field_boundary_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks)
{
        wchar_t *S = L->l_data.wp;
        wchar_t *T = S;
        wchar_t *eol = S + L->l_data_length;
        ssize_t field = is_end ? F->f_end_field : F->f_start_field;
        ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
        ssize_t ret;

        ASSERT(is_end || field > -1);

        if (is_end && field == -1)
                return (L->l_data_length);

        while (field-- > 0) {
                while (T < eol && W_IS_BLANK(*T))
                        T++;

                while (T < eol && !W_IS_BLANK(*T))
                        T++;
        }

        if ((!is_end || offset > 0) && is_blanks) {
                while (W_IS_BLANK(*T))
                        T++;
        }

        if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
                return (L->l_data_length);

        return (ret);
}

static void
field_delimit_wide(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
{
        ASSERT(F->f_start_field > -1);

        *start = field_boundary_wide(F, L, 0,
            F->f_options & FIELD_IGNORE_BLANKS_START);
        *end = field_boundary_wide(F, L, 1,
            F->f_options & FIELD_IGNORE_BLANKS_END);
}

static ssize_t
field_boundary_tabbed(field_t *F, line_rec_t *L, int is_end, int is_blanks,
    vchar_t delimiter)
{
        char *S = L->l_data.sp;
        char *T = S;
        char *eol = S + L->l_data_length;
        ssize_t field = is_end ? F->f_end_field : F->f_start_field;
        ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
        ssize_t ret;

        ASSERT(is_end || field > -1);

        if (is_end && field == -1)
                return (L->l_data_length);

        while (field-- > 0) {
                T = xstrnchr(T, delimiter.sc, eol - T);
                if (T == NULL || T > eol)
                        return (L->l_data_length);

                T++;
        }

        if ((!is_end || offset != 0) && is_blanks) {
                while (IS_BLANK(*T))
                        T++;
        }

        if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
                if (L->l_data_length <= 0)
                        return (0);
                if (S[L->l_data_length - 1] == delimiter.sc) {
                        return (L->l_data_length - 1);
                } else {
                        return (L->l_data_length);
                }
        }

        if (is_end && offset == 0)
                ret--;

        return (ret);
}

/*
 * field_delimit_tabbed() is called when a field separator has been defined
 * using the -t option.  The character at the offset, start, is either one or
 * more character positions past the delimiter marking the start of the
 * field, or at the end of the line.
 */
static void
field_delimit_tabbed(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end,
    vchar_t delimiter)
{
        ASSERT(F->f_start_field > -1);

        *start = field_boundary_tabbed(F, L, 0, F->f_options &
            FIELD_IGNORE_BLANKS_START, delimiter);
        *end = field_boundary_tabbed(F, L, 1, F->f_options &
            FIELD_IGNORE_BLANKS_END, delimiter);
}

static ssize_t
field_boundary_tabbed_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks,
    vchar_t delimiter)
{
        wchar_t *S = L->l_data.wp;
        wchar_t *T = S;
        wchar_t *eol = S + L->l_data_length;
        ssize_t field = is_end ? F->f_end_field : F->f_start_field;
        ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
        ssize_t ret;

        ASSERT(is_end || field > -1);

        if (is_end && field == -1)
                return (L->l_data_length);

        while (field-- > 0) {
                T = xwsnchr(T, delimiter.wc, eol - T);
                if (T == NULL || T > eol)
                        return (L->l_data_length);

                T++;
        }

        if ((!is_end || offset != 0) && is_blanks) {
                while (W_IS_BLANK(*T))
                        T++;
        }

        if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
                if (L->l_data_length <= 0)
                        return (0);
                if (S[L->l_data_length - 1] == delimiter.wc) {
                        return (L->l_data_length - 1);
                } else {
                        return (L->l_data_length);
                }
        }

        if (is_end && offset == 0)
                ret--;

        return (ret);
}

static void
field_delimit_tabbed_wide(field_t *F, line_rec_t *L, ssize_t *start,
    ssize_t *end, vchar_t delimiter)
{
        ASSERT(F->f_start_field > -1);

        *start = field_boundary_tabbed_wide(F, L, 0, F->f_options &
            FIELD_IGNORE_BLANKS_START, delimiter);
        *end = field_boundary_tabbed_wide(F, L, 1, F->f_options &
            FIELD_IGNORE_BLANKS_END, delimiter);
}

/*ARGSUSED*/
ssize_t
field_convert_month(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
        int j;
        ssize_t val;
        char month_candidate[MAX_MON_LEN * MB_LEN_MAX];
        ssize_t month_length = data_length;
        ssize_t month_offset = data_offset;

        if (sizeof (char) > L->l_collate_bufsize - coll_offset)
                return (-1);

        (void) memset(month_candidate, 0, MAX_MON_LEN * MB_LEN_MAX);


        /*
         * The month field formally begins with the first non-blank character.
         */
        while (IS_BLANK(*(L->l_data.sp + month_offset))) {
                month_offset++;
                month_length--;
        }

        for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
                month_candidate[j] = toupper((L->l_data.sp + month_offset)[j]);

        for (j = 0; j < MONTHS_IN_YEAR; j++) {
                if (xstrneql(month_candidate, months[j], month_lengths[j])) {
                        *(L->l_collate.sp + coll_offset) = '\0' + j + MO_OFFSET;
                        return (1);
                }
        }

        /*
         * no matching month; copy string into field.  required behaviour is
         * that "month-free" keys sort before month-sortable keys, so insert
         * a "will sort first" token.
         */
        *(L->l_collate.sp + coll_offset) = '\0' + MO_NONE;

        val = field_convert_alpha_simple(F, L, delimiter, data_offset,
            data_length, coll_offset + 1);

        if (val < 0)
                return (-1);
        else
                return (val + 1);
}

/*ARGSUSED*/
ssize_t
field_convert_month_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
        ssize_t j;
        ssize_t val;
        wchar_t month_candidate[MAX_MON_LEN];
        wchar_t *month;
        wchar_t *buffer = L->l_collate.wp + coll_offset;
        ssize_t month_length = data_length;
        ssize_t month_offset = data_offset;

        if (L->l_collate_bufsize - coll_offset * sizeof (wchar_t) <
            sizeof (wchar_t))
                return (-1);

        (void) memset(month_candidate, 0, MAX_MON_LEN * sizeof (wchar_t));


        while (W_IS_BLANK(*(L->l_data.wp + month_offset))) {
                month_offset++;
                month_length--;
        }

        month = L->l_data.wp + month_offset;

        for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
                month_candidate[j] = towupper(month[j]);

        for (j = 0; j < MONTHS_IN_YEAR; j++)
                if (xwcsneql(month_candidate, w_months[j],
                    w_month_lengths[j])) {
                        *buffer = L'\0' + j + MO_OFFSET;
                        return (1);
                }

        *buffer = L'\0' + MO_NONE;

        val = field_convert_alpha_wide(F, L, delimiter, data_offset,
            data_length, coll_offset + sizeof (wchar_t));

        if (val < 0)
                return (-1);
        else
                return (val + 1);
}

/*
 * field_convert_alpha() always fails with return value -1 if the converted
 * string would cause l_collate_length to exceed l_collate_bufsize
 */
/*ARGSUSED*/
ssize_t
field_convert_alpha(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
        static char *compose;
        static ssize_t compose_length;

        ssize_t clength = 0;
        ssize_t dlength;
        ssize_t i;

        if (compose_length < (data_length + 1)) {
                compose_length = data_length + 1;
                compose = safe_realloc(compose, compose_length * sizeof (char));
        }

        for (i = data_offset; i < data_offset + data_length; i++) {
                char t = (L->l_data.sp)[i];

                if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) &&
                    !isprint((uchar_t)t))
                        continue;

                if ((F->f_options & FIELD_DICTIONARY_ORDER) &&
                    !isalnum((uchar_t)t) && !isspace((uchar_t)t))
                        continue;

                if (F->f_options & FIELD_FOLD_UPPERCASE)
                        t = toupper(t);

                compose[clength++] = t;
        }
        compose[clength] = '\0';

        if ((dlength = xfrm_ops->sx_len(compose, clength)) <
            L->l_collate_bufsize - coll_offset)
                return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
                    compose, dlength + 1));
        else
                return ((ssize_t)-1);
}

/*ARGSUSED*/
ssize_t
field_convert_alpha_simple(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
        static char *compose;
        static ssize_t compose_length;

        ssize_t clength;
        ssize_t dlength;

        if (compose_length < (data_length + 1)) {
                compose_length = data_length + 1;
                compose = safe_realloc(compose, compose_length * sizeof (char));
        }

        (void) memcpy(compose, L->l_data.sp + data_offset, data_length);
        clength = data_length;
        compose[clength] = '\0';

        if ((dlength = xfrm_ops->sx_len(compose, clength)) <
            L->l_collate_bufsize - coll_offset)
                return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
                    compose, dlength + 1));
        else
                return ((ssize_t)-1);
}

/*ARGSUSED*/
ssize_t
field_convert_alpha_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
        wchar_t *compose = safe_realloc(NULL, (data_length + 1) *
            sizeof (wchar_t));
        ssize_t clength = 0;
        ssize_t dlength;
        ssize_t i;
        ssize_t ret;

        for (i = data_offset; i < data_offset + data_length; i++) {
                wchar_t t = (L->l_data.wp)[i];

                if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) && !iswprint(t))
                        continue;

                if ((F->f_options & FIELD_DICTIONARY_ORDER) && !iswalnum(t) &&
                    !iswspace(t))
                        continue;

                if (F->f_options & FIELD_FOLD_UPPERCASE)
                        t = towupper(t);

                compose[clength++] = t;
        }
        compose[clength] = L'\0';

        dlength = wcsxfrm(NULL, compose, (size_t)0);
        if ((dlength * sizeof (wchar_t)) < L->l_collate_bufsize -
            coll_offset * sizeof (wchar_t)) {
                ret = (ssize_t)wcsxfrm(L->l_collate.wp + coll_offset, compose,
                    (size_t)dlength + 1);
        } else {
                ret = (ssize_t)-1;
        }

        safe_free(compose);

        return (ret);
}

/*
 * field_convert_numeric() converts the given field into a collatable numerical
 * sequence.  The sequence is ordered as { log, integer, separator, fraction },
 * with an optional sentinel component at the sequence end.
 */
/*ARGSUSED*/
ssize_t
field_convert_numeric(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
        char *number;
        char *buffer = L->l_collate.sp + coll_offset;
        ssize_t length;

        char sign = '2';
        int log_ten;
        char *digits = buffer + 1 + sizeof (int) / sizeof (char);
        size_t j = 0;
        size_t i;

        int state = BEFORE_NUMBER;

        number = L->l_data.sp + data_offset;
        length = data_length;

        /*
         * Eat leading blanks, if any.
         */
        for (i = 0; i < length; i++)
                if (!IS_BLANK(number[i]))
                        break;

        /*
         * Test that there is sufficient size in the collation buffer for our
         * number.  In addition to the possible remaining characters in the
         * field, we also require space for the sign (char), logarithm (int),
         * separator (char), and as many as two string terminators (for reverse
         * sorts).
         */
        if (((length - i) + 4 * sizeof (char) + sizeof (int)) >
            (L->l_collate_bufsize - coll_offset))
                return ((ssize_t)-1);

        /*
         * If negative, set sign.
         */
        if (number[i] == '-') {
                i++;
                sign = '0';
        }

        /*
         * Scan integer part; eat leading zeros.
         */
        for (; i < length; i++) {
                if (IS_SEPARATOR(number[i]))
                        continue;

                if (number[i] == '0' && !(state & IN_NUMBER))
                        continue;

                if (!isdigit((uchar_t)number[i]))
                        break;

                state |= IN_NUMBER;
                if (sign == '0')
                        digits[j++] = '0' + '9' - number[i];
                else
                        digits[j++] = number[i];
        }

        if (i < length && IS_DECIMAL(number[i])) {
                /*
                 * Integer part terminated by decimal.
                 */
                digits[j] = DECIMAL_CHAR;
                log_ten = j++;

                /*
                 * Scan fractional part.
                 */
                for (++i; i < length; i++) {
                        if (IS_SEPARATOR(number[i]))
                                continue;

                        if (!isdigit((uchar_t)number[i]))
                                break;

                        if (number[i] != '0')
                                state |= IN_NUMBER;

                        if (sign == '0')
                                digits[j++] = '0' + '9' - number[i];
                        else
                                digits[j++] = number[i];
                }

                if (sign == '0')
                        digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
        } else {
                /*
                 * Nondigit or end of string seen.
                 */
                log_ten = (int)j;
                if (sign == '0')
                        digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
                else
                        digits[j] = INTERFIELD_SEPARATOR;
        }

        if ((state & IN_NUMBER) == 0) {
                /*
                 * A non-zero number was not detected; treat as defined zero.
                 */
                sign = '1';
                log_ten = 0;
                digits[0] = '0';
                j = 1;
        }

        /*
         * We subtract a constant from the log of negative values so that
         * they will correctly precede positive values with a zero logarithm.
         */
        if (sign == '0') {
                if (j != 0)
                        log_ten = -log_ten - 2;
                else
                        /*
                         * Special case for -0.
                         */
                        log_ten = -1;
        }

        buffer[0] = sign;

        /*
         * Place logarithm in big-endian form.
         */
        for (i = 0; i < sizeof (int); i++)
                buffer[i + 1] = (log_ten << (i * NBBY))
                    >> ((sizeof (int) - 1) * NBBY);

        if (j + sizeof (char) + sizeof (int) <
            L->l_collate_bufsize - coll_offset)
                return (j + 1 + sizeof (int));
        else
                return ((ssize_t)-1);
}

/*ARGSUSED*/
ssize_t
field_convert_numeric_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
    ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
{
        wchar_t *number;
        wchar_t *buffer = L->l_collate.wp + coll_offset;
        char *lbuffer;
        ssize_t length;

        wchar_t sign = L'2';
        int log_ten;
        wchar_t *digits = buffer + 1 + sizeof (int)/sizeof (wchar_t);
        size_t j = 0;
        size_t i;

        int state = BEFORE_NUMBER;

        number = L->l_data.wp + data_offset;
        length = data_length;

        for (i = 0; i < length; i++)
                if (!W_IS_BLANK(number[i]))
                        break;

        if (((length - i) * sizeof (wchar_t) + 4 * sizeof (wchar_t) +
            sizeof (int)) > (L->l_collate_bufsize - coll_offset))
                return ((ssize_t)-1);

        if (number[i] == L'-') {
                i++;
                sign = L'0';
        }

        for (; i < length; i++) {
                if (W_IS_SEPARATOR(number[i]))
                        continue;

                if (number[i] == L'0' && !(state & IN_NUMBER))
                        continue;

                if (!iswdigit(number[i]))
                        break;

                state |= IN_NUMBER;
                if (sign == L'0')
                        digits[j++] = L'0' + L'9' - number[i];
                else
                        digits[j++] = number[i];
        }

        if (i < length && W_IS_DECIMAL(number[i])) {
                digits[j] = W_DECIMAL_CHAR;
                log_ten = j++;

                for (++i; i < length; i++) {
                        if (W_IS_SEPARATOR(number[i]))
                                continue;

                        if (!iswdigit(number[i]))
                                break;

                        if (number[i] != L'0')
                                state |= IN_NUMBER;

                        if (sign == L'0')
                                digits[j++] = L'0' + L'9' - number[i];
                        else
                                digits[j++] = number[i];
                }

                if (sign == L'0')
                        digits[j++] = (wchar_t)(WCHAR_MAX -
                            W_INTERFIELD_SEPARATOR);
        } else {
                log_ten = (int)j;
                if (sign == L'0')
                        digits[j++] = (wchar_t)(WCHAR_MAX -
                            W_INTERFIELD_SEPARATOR);
                else
                        digits[j] = W_INTERFIELD_SEPARATOR;
        }

        if ((state & IN_NUMBER) == 0) {
                sign = L'1';
                log_ten = 0;
                digits[0] = L'0';
                j = 1;
        }

        if (sign == L'0') {
                if (j != 0)
                        log_ten = -log_ten - 2;
                else
                        log_ten = -1;
        }

        buffer[0] = sign;
        /*
         * Place logarithm in big-endian form.
         */
        lbuffer = (char *)(buffer + 1);
        for (i = 0; i < sizeof (int); i++)
                lbuffer[i] = (log_ten << (i * NBBY))
                    >> ((sizeof (int) - 1) * NBBY);

        if ((j + 1 + sizeof (int)/sizeof (wchar_t)) * sizeof (wchar_t) <
            L->l_collate_bufsize - coll_offset * sizeof (wchar_t))
                return (j + 1 + sizeof (int) / sizeof (wchar_t));
        else
                return ((ssize_t)-1);
}

/*
 * flags contains one of CV_REALLOC, CV_FAIL, specifying the preferred behaviour
 * when coll_offset exceeds l_collate_bufsize.
 */
ssize_t
field_convert(field_t *F, line_rec_t *L, int flags, vchar_t field_separator)
{
        ssize_t coll_offset = 0;
        ssize_t start, end, distance;
        field_t *cur_fieldp = F;

        while (cur_fieldp != NULL) {
                /*
                 * delimit field
                 */
                if (!field_separator.sc)
                        field_delimit(cur_fieldp, L, &start, &end);
                else
                        field_delimit_tabbed(cur_fieldp, L, &start, &end,
                            field_separator);

                distance = 0;
                if (end - start > 0 ||
                    (end - start == 0 && F->f_species == NUMERIC)) {
                        /*
                         * Convert field, appending to collated field of line
                         * record.
                         */
                        distance = cur_fieldp->f_convert(cur_fieldp, L,
                            field_separator, start, end - start, coll_offset);

                        /*
                         * branch should execute comparatively rarely
                         */
                        if (distance == -1) {
                                if (flags & FCV_REALLOC) {
                                        ASSERT(L->l_collate_bufsize > 0);
                                        L->l_collate_bufsize *= 2;
                                        L->l_collate.sp =
                                            safe_realloc(L->l_collate.sp,
                                            L->l_collate_bufsize);

                                        __S(stats_incr_convert_reallocs());
                                        continue;
                                } else {
                                        /*
                                         * FCV_FAIL has been set.
                                         */
                                        return (-1);
                                }
                        }
                }

                if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
                        xstrninv(L->l_collate.sp, coll_offset, distance);
                        *(L->l_collate.sp + coll_offset + distance) =
                            (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
                        distance++;
                }

                ASSERT(distance >= 0);
                coll_offset += distance;
                if (coll_offset >= L->l_collate_bufsize) {
                        if (flags & FCV_REALLOC) {
                                ASSERT(L->l_collate_bufsize > 0);
                                L->l_collate_bufsize *= 2;
                                L->l_collate.sp = safe_realloc(L->l_collate.sp,
                                    L->l_collate_bufsize);

                                __S(stats_incr_convert_reallocs());
                        } else {
                                return (-1);
                        }
                }
                *(L->l_collate.sp + coll_offset) = INTERFIELD_SEPARATOR;
                coll_offset++;

                cur_fieldp = cur_fieldp->f_next;
        }

        L->l_collate_length = coll_offset;

        return (L->l_collate_length);
}

ssize_t
field_convert_wide(field_t *F, line_rec_t *L, int flags,
    vchar_t field_separator)
{
        ssize_t coll_offset = 0;
        ssize_t start, end, distance;
        field_t *cur_fieldp = F;

        while (cur_fieldp != NULL) {
                if (!field_separator.wc)
                        field_delimit_wide(cur_fieldp, L, &start, &end);
                else
                        field_delimit_tabbed_wide(cur_fieldp, L, &start, &end,
                            field_separator);

                distance = 0;
                if (end - start > 0 ||
                    end - start == 0 && F->f_species == NUMERIC) {
                        distance = cur_fieldp->f_convert(cur_fieldp, L,
                            field_separator, start, end - start, coll_offset);

                        if (distance == -1) {
                                if (flags & FCV_REALLOC) {
                                        ASSERT(L->l_collate_bufsize > 0);
                                        L->l_collate_bufsize *= 2;
                                        L->l_collate.wp = safe_realloc(
                                            L->l_collate.wp,
                                            L->l_collate_bufsize);

                                        __S(stats_incr_convert_reallocs());
                                        continue;
                                } else {
                                        return (-1);
                                }
                        }
                }

                if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
                        xwcsninv(L->l_collate.wp, coll_offset, distance);
                        *(L->l_collate.wp + coll_offset + distance) =
                            WCHAR_MAX - INTERFIELD_SEPARATOR;
                        distance++;
                }

                ASSERT(distance >= 0);
                coll_offset += distance;
                if (coll_offset * sizeof (wchar_t) >= L->l_collate_bufsize) {
                        if (flags & FCV_REALLOC) {
                                ASSERT(L->l_collate_bufsize > 0);
                                L->l_collate_bufsize *= 2;
                                L->l_collate.wp = safe_realloc(L->l_collate.wp,
                                    L->l_collate_bufsize);

                                __S(stats_incr_convert_reallocs());
                        } else {
                                return (-1);
                        }
                }
                *(L->l_collate.wp + coll_offset) = W_INTERFIELD_SEPARATOR;
                coll_offset++;

                cur_fieldp = cur_fieldp->f_next;
        }

        L->l_collate_length = coll_offset * sizeof (wchar_t);
#ifdef _LITTLE_ENDIAN
        xwcsntomsb(L->l_collate.wp, coll_offset);
#endif /* _LITTLE_ENDIAN */

        return (L->l_collate_length);
}

/*
 * line_convert() and line_convert_wide() are called when the collation vector
 * of a given line has been exhausted, and we are performing the final,
 * full-line comparison required by the sort specification.  Because we do not
 * have a guarantee that l_data is null-terminated, we create an explicitly
 * null-terminated copy suitable for transformation to a collatable form for the
 * current locale.
 */
static void
line_convert(line_rec_t *L)
{
        static ssize_t bufsize;
        static char *buffer;

        if (L->l_raw_collate.sp != NULL)
                return;

        if (L->l_data_length + 1 > bufsize) {
                buffer = safe_realloc(buffer, L->l_data_length + 1);
                bufsize = L->l_data_length + 1;
        }

        (void) strncpy(buffer, L->l_data.sp, L->l_data_length);
        buffer[L->l_data_length] = '\0';

        L->l_raw_collate.sp = safe_realloc(L->l_raw_collate.sp,
            xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
        xfrm_ops->sx_xfrm(L->l_raw_collate.sp, buffer,
            xfrm_ops->sx_len(buffer, L->l_data_length) + 1);

        __S(stats_incr_line_conversions());
}

static void
line_convert_wide(line_rec_t *L)
{
        static wchar_t *buffer;
        static ssize_t bufsize;

        ssize_t dlength;

        if (L->l_raw_collate.wp != NULL)
                return;

        if (L->l_data_length + 1 > bufsize) {
                buffer = safe_realloc(buffer, (L->l_data_length + 1) *
                    sizeof (wchar_t));
                bufsize = L->l_data_length + 1;
        }

        (void) wcsncpy(buffer, L->l_data.wp, L->l_data_length);
        buffer[L->l_data_length] = L'\0';

        dlength = wcsxfrm(NULL, buffer, 0) + 1;
        L->l_raw_collate.wp = safe_realloc(L->l_raw_collate.wp, dlength *
            sizeof (wchar_t));
        (void) wcsxfrm(L->l_raw_collate.wp, buffer, dlength);

        __S(stats_incr_line_conversions());
}

/*
 * Our convention for collation is
 *
 *      A > B  => r > 0,
 *      A == B => r = 0,
 *      A < B  => r < 0
 *
 * This convention is consistent with the definition of memcmp(), strcmp(), and
 * strncmp() in the C locale.  collated() and collated_wide() have two optional
 * behaviours, which can be activated by setting the appropriate values in
 * coll_flag:  COLL_UNIQUE, which returns 0 if the l_collate fields of the line
 * records being compared are identical; COLL_DATA_ONLY, which ignores the
 * l_collate field for the current comparison; and COLL_REVERSE, which flips the
 * result for comparisons that fall through to an actual data comparison (since
 * the collated vector should already reflect reverse ordering from field
 * conversion).
 */
int
collated(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
{
        ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
        int r;
        int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
            INT_SIGN_PASS_MASK;
        ssize_t la, lb;

        if (!(coll_flag & COLL_DATA_ONLY)) {
                if (ml > 0) {
                        r = memcmp(A->l_collate.sp + depth,
                            B->l_collate.sp + depth, ml);

                        if (r)
                                return (r);
                }

                if (A->l_collate_length < B->l_collate_length)
                        return (-1);

                if (A->l_collate_length > B->l_collate_length)
                        return (1);
        }

        /*
         * This is where we cut out, if we know that the current sort is over
         * the entire line.
         */
        if (coll_flag & COLL_UNIQUE)
                return (0);

        line_convert(A);
        line_convert(B);

        la = strlen(A->l_raw_collate.sp);
        lb = strlen(B->l_raw_collate.sp);

        r = memcmp(A->l_raw_collate.sp, B->l_raw_collate.sp, MIN(la, lb));

        if (r)
                return (r ^ mask);

        if (la < lb)
                return (-1 ^ mask);

        if (la > lb)
                return (1 ^ mask);

        return (0);
}

int
collated_wide(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
{
        ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
        int r;
        int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
            INT_SIGN_PASS_MASK;
        ssize_t la, lb;

        if (!(coll_flag & COLL_DATA_ONLY)) {
                if (ml > 0) {
                        r = memcmp(A->l_collate.sp + depth,
                            B->l_collate.sp + depth, ml);

                        if (r)
                                return (r);
                }
                if (A->l_collate_length < B->l_collate_length)
                        return (-1);

                if (A->l_collate_length > B->l_collate_length)
                        return (1);
        }

        if (coll_flag & COLL_UNIQUE)
                return (0);

        line_convert_wide(A);
        line_convert_wide(B);

        la = wcslen(A->l_raw_collate.wp);
        lb = wcslen(B->l_raw_collate.wp);

        r = wmemcmp(A->l_raw_collate.wp, B->l_raw_collate.wp,
            (size_t)MIN(la, lb));

        if (r)
                return (r ^ mask);

        if (la < lb)
                return (-1 ^ mask);

        if (la > lb)
                return (1 ^ mask);

        return (0);
}