root/usr/src/lib/libc/port/locale/localeimpl.c
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
 * Copyright 2025 Oxide Computer Company
 * Copyright 2026 Bill Sommerfeld <sommerfeld@hamachi.org>
 */

/*
 * This file implements the 2008 newlocale and friends handling.
 */

#ifndef _LCONV_C99
#define _LCONV_C99
#endif

#include "lint.h"
#include <atomic.h>
#include <locale.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <errno.h>
#include <string.h>
#include "libc.h"
#include "mtlib.h"
#include "tsd.h"
#include "localeimpl.h"
#include "lctype.h"

/*
 * Big Theory of Locales:
 *
 * (It is recommended that readers familiarize themselves with the POSIX
 * 2008 (XPG Issue 7) specifications for locales, first.)
 *
 * Historically, we had a bunch of global variables that stored locale
 * data.  While this worked well, it limited applications to a single locale
 * at a time.  This doesn't work well in certain server applications.
 *
 * In Issue 7, X/Open introduced the concept of a locale_t object, along with
 * versions of functions that can take this object as a parameter, along
 * with functions to clone and manipulate these locale objects.  The new
 * functions are named with a _l() suffix.
 *
 * Additionally, uselocale() is introduced, which can change the locale of
 * a single thread.  However, setlocale() can still be used to change
 * the global locale.
 *
 * In our implementation, we use libc's TSD to store the locale data that
 * was previously global.  We still have global data because some applications
 * have had those global objects compiled into them.  (Such applications will
 * be unable to benefit from uselocale(), btw.)  The legacy routines are
 * reimplemented as wrappers that use the appropriate locale object by
 * calling uselocale().  uselocale() when passed a NULL pointer returns the
 * thread-specific locale object if one is present, or the global locale
 * object otherwise.  Note that once the TSD data is set, the only way
 * to revert to the global locale is to pass the global locale LC_GLOBAL_LOCALE
 * to uselocale().
 *
 * We are careful to minimize performance impact of multiple calls to
 * uselocale() or setlocale() by using a cache of locale data whenever possible.
 * As a consequence of this, applications that iterate over all possible
 * locales will burn through a lot of virtual memory, but we find such
 * applications rare.  (locale -a might be an exception, but it is short lived.)
 *
 * Category data is never released (although enclosing locale objects might be),
 * in order to guarantee thread-safety.  Calling freelocale() on an object
 * while it is in use by another thread is a programmer error (use-after-free)
 * and we don't bother to note it further.
 *
 * Locale objects (global locales) established by setlocale() are also
 * never freed (for MT safety), but we will save previous locale objects
 * and reuse them when we can.
 */

/*
 * Signature shared by the per-category loaders: given a locale name, return
 * the locdata for that category.  locdata_get_cache() treats a NULL return
 * as a failed load.
 */
typedef struct locdata *(*loadfn_t)(const char *);

/*
 * One loader per category, indexed by category number.  locdata_get_cache()
 * calls loaders[category] when the requested name is not already cached.
 */
static const loadfn_t loaders[LC_ALL] = {
        __lc_ctype_load,
        __lc_numeric_load,
        __lc_time_load,
        __lc_collate_load,
        __lc_monetary_load,
        __lc_messages_load,
};

/*
 * Built-in C/POSIX category data, defined outside this file.  These are
 * referenced by posix_locale and cutf_locdata below.
 */
extern struct lc_monetary lc_monetary_posix;
extern struct lc_numeric lc_numeric_posix;
extern struct lc_messages lc_messages_posix;
extern struct lc_time lc_time_posix;
extern struct lc_ctype lc_ctype_posix;
extern struct lc_collate lc_collate_posix;
extern struct _RuneLocale _DefaultRuneLocale;

/*
 * The built-in "C" locale object.  It is the initial value of
 * ___global_locale, it is what locdata_get() hands out for the names "C"
 * and "POSIX", and freelocale() explicitly refuses to free it.
 */
static struct _locale posix_locale = {
        /* locdata: per-category entries, listed in category-number order */
        .locdata = {
                &__posix_ctype_locdata,
                &__posix_numeric_locdata,
                &__posix_time_locdata,
                &__posix_collate_locdata,
                &__posix_monetary_locdata,
                &__posix_messages_locdata,
        },
        .locname = "C",
        .ctype = &lc_ctype_posix,
        .numeric = &lc_numeric_posix,
        .collate = &lc_collate_posix,
        .monetary = &lc_monetary_posix,
        .messages = &lc_messages_posix,
        .time = &lc_time_posix,
        .runelocale = &_DefaultRuneLocale,
};

/*
 * The current global locale.  It starts out as the built-in "C" locale;
 * uselocale() falls back to it when the calling thread has no locale of
 * its own.
 */
locale_t ___global_locale = &posix_locale;

/*
 * Function accessor returning the current value of ___global_locale.
 */
locale_t
__global_locale(void)
{
        return (___global_locale);
}

/*
 * Locale data for hybrid C.UTF-8 locale having all the characteristics of
 * default C/POSIX locale, except for LC_CTYPE data which is retrieved from
 * cache/file as for other UTF-8 locales.
 *
 * The table is indexed by category number.  locdata_get() only returns an
 * entry from here when the category is not LC_CTYPE, which is why the first
 * slot carries no data.
 */
static struct locdata cutf_locdata[LC_ALL] = {
        { "C.UTF-8", NULL }, /* unused */
        { "C.UTF-8", &lc_numeric_posix },
        { "C.UTF-8", &lc_time_posix },
        { "C.UTF-8", &lc_collate_posix },
        { "C.UTF-8", &lc_monetary_posix },
        { "C.UTF-8", &lc_messages_posix },
};

/*
 * Category names for getenv().  Note that this was modified
 * for Solaris.  See <iso/locale_iso.h>.
 *
 * The array is indexed by category number: get_locale_env() looks up
 * categories[LC_ALL] and categories[category], and locdata_get() uses
 * categories[category] as the key when parsing glibc-style composite names.
 *
 * NOTE(review): NUM_CATS is not referenced anywhere in the visible part of
 * this file; the array size is spelled out as a literal 7 instead.
 */
#define NUM_CATS        7
static char *categories[7] = {
        "LC_CTYPE",
        "LC_NUMERIC",
        "LC_TIME",
        "LC_COLLATE",
        "LC_MONETARY",
        "LC_MESSAGES",
        "LC_ALL",
};

/*
 * Prototypes.
 */
static const char *get_locale_env(int);
static struct locdata *locdata_get(int, const char *);
static struct locdata *locdata_get_cache(int, const char *);
static locale_t mklocname(locale_t);

/*
 * Some utility routines.
 */

/*
 * Allocate a locdata header plus a primary data buffer of memsz bytes,
 * which is stored in l_data[0], and record name as the locale name
 * (truncated to fit l_lname if necessary).
 *
 * Returns the new locdata, or NULL if either allocation fails.  When the
 * data buffer cannot be allocated the header is released again and errno
 * is set to ENOMEM.
 */
struct locdata *
__locdata_alloc(const char *name, size_t memsz)
{
        struct locdata *ld = lmalloc(sizeof (*ld));

        if (ld == NULL)
                return (NULL);

        ld->l_data[0] = libc_malloc(memsz);
        if (ld->l_data[0] == NULL) {
                lfree(ld, sizeof (*ld));
                errno = ENOMEM;
                return (NULL);
        }

        (void) strlcpy(ld->l_lname, name, sizeof (ld->l_lname));
        return (ld);
}

/*
 * Locale data is normally kept forever.  This routine exists for the case
 * where loading failed part-way and the half-built locdata has to be thrown
 * away: it releases every l_data[] buffer, unmaps l_map when a non-empty
 * mapping is recorded, and finally frees the locdata header itself.
 */
void
__locdata_free(struct locdata *ldata)
{
        int slot = 0;

        while (slot < NLOCDATA) {
                libc_free(ldata->l_data[slot]);
                slot++;
        }

        if (ldata->l_map != NULL && ldata->l_map_len != 0) {
                (void) munmap(ldata->l_map, ldata->l_map_len);
        }

        lfree(ldata, sizeof (*ldata));
}

/*
 * It turns out that for performance reasons we would really like to
 * cache the most recently referenced locale data to avoid wasteful
 * loading from files.
 *
 * cache_data[] holds, per category, the locdata most recently returned by
 * locdata_get_cache().  cat_data[] is, per category, the head of a list
 * (linked through l_next) of every locdata that routine has returned.
 * cache_lock protects both arrays and the lists.
 */

static struct locdata *cache_data[LC_ALL];
static struct locdata *cat_data[LC_ALL];
static mutex_t cache_lock = DEFAULTMUTEX;

/*
 * Find the locale data for locname in the given category, consulting in
 * order: the single most-recently-used entry, the list of everything loaded
 * so far for the category, and finally the category's loader.
 *
 * Returns the locdata, or NULL if category is out of range or the loader
 * fails.  On success the entry becomes the most-recently-used one and is
 * linked onto the category list if it was not there already.
 *
 * NOTE(review): earlier documentation for this routine spoke of a hold
 * being taken on behalf of the caller; no reference counting is visible in
 * this function.
 */
static struct locdata *
locdata_get_cache(int category, const char *locname)
{
        struct locdata *ld;

        if (category < 0 || category >= LC_ALL)
                return (NULL);

        lmutex_lock(&cache_lock);

        /* Fast path: same name as the last request for this category. */
        ld = cache_data[category];
        if (ld != NULL && strcmp(ld->l_lname, locname) == 0) {
                lmutex_unlock(&cache_lock);
                return (ld);
        }

        /*
         * Next, walk the list of locales already loaded for this category.
         * A linear scan is fine: a hash would be possible, but it is
         * unlikely that one application ever touches more than a handful
         * of locales.
         */
        ld = cat_data[category];
        while (ld != NULL && strcmp(locname, ld->l_lname) != 0)
                ld = ld->l_next;

        if (ld == NULL) {
                /*
                 * Not loaded yet, so go to the on-disk data.  The lock is
                 * dropped around the loader because libc does not want
                 * internal locks held while calling routines that read
                 * files, allocate memory, and so on.
                 *
                 * That opens a small window in which two threads can both
                 * load the same locale, leaving duplicate entries on the
                 * list.  The only cost is the memory for the extra copy;
                 * closing the window (by rescanning the list after
                 * reacquiring the lock) is not considered worth the effort
                 * for a race that will probably never happen.
                 */
                lmutex_unlock(&cache_lock);
                ld = (*loaders[category])(locname);
                lmutex_lock(&cache_lock);

                if (ld != NULL) {
                        (void) strlcpy(ld->l_lname, locname,
                            sizeof (ld->l_lname));
                }
        }

        if (ld == NULL) {
                lmutex_unlock(&cache_lock);
                return (NULL);
        }

        /*
         * Remember this entry as the most recent one, and push it onto the
         * head of the category list the first time we see it (recently
         * used entries are the likeliest to be wanted again).
         */
        cache_data[category] = ld;
        if (!ld->l_cached) {
                ld->l_cached = 1;
                ld->l_next = cat_data[category];
                cat_data[category] = ld;
        }

        lmutex_unlock(&cache_lock);
        return (ld);
}

/*
 * Charmap aliases, mostly found in Linux.
 *
 * locdata_get() rewrites a locale name that ends in one of the alias
 * spellings so that it ends in the corresponding name instead.  The table
 * is terminated by an entry whose alias is NULL.
 */
static const struct {
        const char *alias;
        const char *name;
} cmalias[] = {
        { "utf8", "UTF-8" },
        { "iso88591", "ISO8859-1" },
        { "iso885915", "ISO8859-15" },
        { "gb18030", "GB18030" },
        { "koi8r", "KOI8-R" },
        { NULL, NULL }
};

/*
 * Routine to get the locdata for a given category and locale.
 * This includes retrieving it from cache, retrieving it from
 * a file, etc.
 *
 * The name is resolved in these steps:
 *
 *  1. A NULL or empty locname is replaced by the value taken from the
 *     environment (see get_locale_env()).
 *  2. A composite name is reduced to the piece for this category.  Two
 *     composite syntaxes are understood: '/'-separated components in
 *     category-number order, and glibc-style ';'-separated
 *     <category>=<value> assignments.
 *  3. "C" and "POSIX" resolve to the built-in POSIX data.
 *  4. A name ending in a known charmap alias (see cmalias[]) is rewritten
 *     to end in the canonical charmap name.
 *  5. "C.UTF-8" resolves to the built-in hybrid data for every category
 *     except LC_CTYPE.
 *  6. Anything else goes to locdata_get_cache().
 *
 * Returns NULL on failure.  This routine itself sets errno to EINVAL when a
 * glibc-style name has no entry for the category and to ENOENT when such an
 * entry's value is longer than ENCODING_LEN.
 */
static struct locdata *
locdata_get(int category, const char *locname)
{
        char scratch[ENCODING_LEN + 1];
        char scratch2[ENCODING_LEN + 1];
        const char *sep, *cm;
        int cnt;
        int len;
        int i;

        if (locname == NULL || *locname == 0) {
                locname = get_locale_env(category);
        }

        /*
         * Extract the locale name for the category if it is a composite
         * locale.
         */
        if ((sep = strchr(locname, '/')) != NULL) {
                /*
                 * Step forward one component per category number.  If the
                 * name has fewer components than that, the walk stops at
                 * the last component and that one is used.
                 */
                for (cnt = category; cnt && sep != NULL; cnt--) {
                        locname = sep + 1;
                        sep = strchr(locname, '/');
                }
                /*
                 * len is the size handed to strlcpy(), so it includes room
                 * for the terminator; it is clamped to the scratch buffer,
                 * which silently truncates over-long components.
                 */
                if (sep) {
                        len = sep - locname + 1;
                        if (len >= sizeof (scratch)) {
                                len = sizeof (scratch);
                        }
                } else {
                        len = sizeof (scratch);
                }
                (void) strlcpy(scratch, locname, len);
                locname = scratch;
        } else if ((sep = strchr(locname, ';')) != NULL) {
                /*
                 * Accept glibc-style composite locale as libstdc++ expects.
                 * glibc names composite locales using a semicolon-
                 * separated list of <category>=<value> assignments.
                 *
                 * Segment the string at semicolons, checking if each segment
                 * starts with the name of the category we're looking for,
                 * followed by an equals sign.  Ignore everything else.
                 */
                const char *catname = categories[category];
                size_t catlen = strlen(catname);
                const char *locnameend = locname + strlen(locname);
                const char *locp = locname; /* start of this entry */
                const char *endp = sep;   /* end of this entry */
                const char *value = NULL; /* start of match */
                const char *endvalue = NULL; /* end of match */
                size_t copylen;

                /*
                 * The loop does not stop at the first match, so if the
                 * category is assigned more than once the last assignment
                 * is the one that is used.
                 */
                for (;;) {
                        /*
                         * The length test guarantees that locp[catlen] is
                         * still inside the current segment.
                         */
                        if (((endp - locp) > catlen) &&
                            (memcmp(locp, catname, catlen) == 0) &&
                            (locp[catlen] == '=')) {
                                value = &locp[catlen + 1];
                                endvalue = endp;
                        }
                        if (endp >= locnameend)
                                break;
                        locp = endp + 1;
                        endp = strchr(locp, ';');
                        if (endp == NULL)
                                endp = locnameend;
                }

                /* match glibc errno */
                if (value == NULL) {
                        errno = EINVAL;
                        return (NULL);
                }
                copylen = endvalue - value;
                /*
                 * As real locale names are all shorter
                 * than this, anything longer is
                 * unparsable garbage.  Return failure.
                 */
                if (copylen > ENCODING_LEN) {
                        errno = ENOENT;
                        return (NULL);
                }
                if (copylen == 0) {
                        /*
                         * glibc falls through to the
                         * environment, so we must too..
                         */
                        locname = get_locale_env(category);
                } else {
                        memcpy(scratch, value, copylen);
                        scratch[copylen] = 0;
                        locname = scratch;
                }
        }

        if ((strcmp(locname, "C") == 0) || (strcmp(locname, "POSIX") == 0))
                return (posix_locale.locdata[category]);

        /*
         * Handle charmap aliases.  An alias only counts when the first
         * place it occurs in the name is also the very end of the name.
         * The rewritten name is built in scratch2; if it would not fit,
         * the name is left as it was.
         */
        for (i = 0; cmalias[i].alias != NULL; i++) {
                if ((cm = strstr(locname, cmalias[i].alias)) != NULL &&
                    strlen(cm) == strlen(cmalias[i].alias)) {
                        /* Size for strlcpy(): prefix plus terminator. */
                        len = cm - locname + 1;
                        if (len + strlen(cmalias[i].name) >= sizeof (scratch2))
                                break;
                        (void) strlcpy(scratch2, locname, len);
                        (void) strlcat(scratch2, cmalias[i].name,
                            sizeof (scratch2));
                        locname = scratch2;
                        break;
                }
        }

        /* LC_CTYPE for C.UTF-8 is loaded like any other locale's. */
        if ((strcmp(locname, "C.UTF-8") == 0) && (category != LC_CTYPE))
                return (&cutf_locdata[category]);

        return (locdata_get_cache(category, locname));
}

/* tsd destructor */
static void
freelocptr(void *arg)
{
        locale_t *locptr = arg;
        if (*locptr != NULL)
                freelocale(*locptr);
}

/*
 * Work out the locale name the environment requests for a category.  The
 * first of these that is set to a non-empty string wins:
 *
 *  1. LC_ALL
 *  2. the category's own LC_* variable
 *  3. LANG
 *
 * If none of them is usable the answer is "C".
 */
static const char *
get_locale_env(int category)
{
        const char *val;

        val = getenv(categories[LC_ALL]);
        if (val != NULL && *val != '\0')
                return (val);

        val = getenv(categories[category]);
        if (val != NULL && *val != '\0')
                return (val);

        val = getenv("LANG");
        if (val != NULL && *val != '\0')
                return (val);

        return ("C");
}


/*
 * Backing routine for the MB_CUR_MAX macro.  Legacy code will keep reading
 * _ctype[520], but this function is preferred because it is the only way
 * to obtain thread-specific information.
 *
 * Returns the lc_max_mblen value from the LC_CTYPE data of loc.
 */
unsigned char
__mb_cur_max_l(locale_t loc)
{
        unsigned char width = loc->ctype->lc_max_mblen;

        return (width);
}

unsigned char
__mb_cur_max(void)
{
        return (__mb_cur_max_l(uselocale(NULL)));
}

/*
 * Public interfaces.
 */

/*
 * Create a new locale object that is a copy of src.
 *
 * The copy shares the per-category locdata with src (category data is never
 * released, so sharing is safe), has its "loaded" flags cleared, and
 * carries the same locale name as src.
 *
 * As an illumos extension a NULL src means the global locale; POSIX expects
 * LC_GLOBAL_LOCALE to be passed for that.
 *
 * Returns the new locale, to be released with freelocale(), or NULL if the
 * object cannot be allocated.
 */
locale_t
duplocale(locale_t src)
{
        locale_t        loc;
        int             i;

        loc = lmalloc(sizeof (*loc));
        if (loc == NULL) {
                return (NULL);
        }
        if (src == NULL) {
                /* illumos extension: POSIX says LC_GLOBAL_LOCALE here */
                src = ___global_locale;
        }
        for (i = 0; i < LC_ALL; i++) {
                loc->locdata[i] = src->locdata[i];
                loc->loaded[i] = 0;
        }
        loc->collate = loc->locdata[LC_COLLATE]->l_data[0];
        loc->ctype = loc->locdata[LC_CTYPE]->l_data[0];
        loc->runelocale = loc->locdata[LC_CTYPE]->l_data[1];
        loc->messages = loc->locdata[LC_MESSAGES]->l_data[0];
        loc->monetary = loc->locdata[LC_MONETARY]->l_data[0];
        loc->numeric = loc->locdata[LC_NUMERIC]->l_data[0];
        loc->time = loc->locdata[LC_TIME]->l_data[0];

        /*
         * The name has to be carried over too.  Nothing else sets locname
         * on this freshly allocated object, so without this copy
         * getlocalename_l(LC_ALL, ...) on the duplicate would not report
         * the name of the locale it was copied from.
         */
        (void) strlcpy(loc->locname, src->locname, sizeof (loc->locname));

        return (loc);
}

/*
 * Release a locale object obtained from newlocale() or duplocale().
 *
 * NULL is ignored, and so are the built-in POSIX locale and any locale
 * flagged as on_list (the saved locales created by setlocale()).  Skipping
 * those should not be strictly necessary, but the extra safety is cheap.
 */
void
freelocale(locale_t loc)
{
        if (loc == NULL || loc == &posix_locale)
                return;

        if (loc->on_list)
                return;

        lfree(loc, sizeof (*loc));
}

/*
 * Build a new locale object from base, replacing the categories selected
 * by catmask with the data for locname.
 *
 * A NULL base means "start from the global locale".  Passing the global
 * locale itself as base is technically illegal, but it is accepted and
 * treated the same way.
 *
 * On success base is handed to freelocale() and the new locale, with its
 * name filled in, is returned.  On failure NULL is returned and base is
 * left alone: errno is EINVAL for an invalid catmask, otherwise it is
 * whatever the failing step left behind.
 */
locale_t
newlocale(int catmask, const char *locname, locale_t base)
{
        locale_t fresh;
        int cat;

        if ((catmask & ~(LC_ALL_MASK)) != 0) {
                errno = EINVAL;
                return (NULL);
        }

        fresh = duplocale((base == NULL) ? ___global_locale : base);
        if (fresh == NULL)
                return (NULL);

        for (cat = 0; cat < LC_ALL; cat++) {
                struct locdata *ld;

                fresh->loaded[cat] = 0;

                /* Categories outside the mask keep the base locale's data. */
                if ((catmask & (1 << cat)) == 0)
                        continue;

                ld = locdata_get(cat, locname);
                if (ld == NULL) {
                        /* freelocale() must not disturb the reported error. */
                        int saved = errno;

                        freelocale(fresh);
                        errno = saved;
                        return (NULL);
                }
                fresh->locdata[cat] = ld;
        }

        /* Refresh the shortcut pointers now that locdata[] is final. */
        fresh->ctype = fresh->locdata[LC_CTYPE]->l_data[0];
        fresh->runelocale = fresh->locdata[LC_CTYPE]->l_data[1];
        fresh->numeric = fresh->locdata[LC_NUMERIC]->l_data[0];
        fresh->time = fresh->locdata[LC_TIME]->l_data[0];
        fresh->collate = fresh->locdata[LC_COLLATE]->l_data[0];
        fresh->monetary = fresh->locdata[LC_MONETARY]->l_data[0];
        fresh->messages = fresh->locdata[LC_MESSAGES]->l_data[0];

        freelocale(base);

        return (mklocname(fresh));
}

/*
 * Query or change the calling thread's locale.
 *
 * A NULL argument only queries.  Passing the global locale
 * (LC_GLOBAL_LOCALE) clears the thread's own setting, so the thread goes
 * back to following setlocale().  Any other value is installed as the
 * thread's locale without validation.
 *
 * Returns the locale that was in effect for the thread before the call:
 * its own locale if it had one, the global locale otherwise.  If the
 * thread-specific slot cannot be obtained, NULL is returned with errno set
 * to EINVAL.
 *
 * Freeing the returned locale is up to the caller; needless to say,
 * calling freelocale() on a locale that is still in use would be a gross
 * error.
 */
locale_t
uselocale(locale_t loc)
{
        locale_t previous = ___global_locale;
        locale_t *slot;

        slot = tsdalloc(_T_SETLOCALE, sizeof (locale_t), freelocptr);
        if (slot == NULL) {
                /* Should never occur */
                errno = EINVAL;
                return (NULL);
        }

        if (*slot != NULL)
                previous = *slot;

        if (loc != NULL)
                *slot = (loc == ___global_locale) ? NULL : loc;

        return (previous);
}

/*
 * Fill in loc->locname from the per-category locale names and return loc.
 *
 * When every category carries the same name, that name is used as is.
 * Otherwise the six category names are joined with '/' into a composite
 * name.
 */
static locale_t
mklocname(locale_t loc)
{
        const char *first = loc->locdata[0]->l_lname;
        int i;

        /* Stop at the first category whose name differs from the first. */
        for (i = 1; i < LC_ALL; i++) {
                if (strcmp(first, loc->locdata[i]->l_lname) != 0)
                        break;
        }

        if (i >= LC_ALL) {
                /* All categories agree: a simple name is enough. */
                (void) strlcpy(loc->locname, loc->locdata[LC_CTYPE]->l_lname,
                    sizeof (loc->locname));
                return (loc);
        }

        /*
         * The components are emitted in category-number order.  setlocale()
         * relies on this order, so the two must be changed together.
         */
        (void) snprintf(loc->locname, sizeof (loc->locname),
            "%s/%s/%s/%s/%s/%s",
            loc->locdata[LC_CTYPE]->l_lname,
            loc->locdata[LC_NUMERIC]->l_lname,
            loc->locdata[LC_TIME]->l_lname,
            loc->locdata[LC_COLLATE]->l_lname,
            loc->locdata[LC_MONETARY]->l_lname,
            loc->locdata[LC_MESSAGES]->l_lname);

        return (loc);
}

/*
 * Return the name of a locale, either for one category or (LC_ALL) for the
 * locale as a whole.  A NULL locale or an unknown category yields NULL;
 * POSIX defines no errors for this interface, so nothing is reported via
 * errno.
 *
 * POSIX places different lifetime requirements on the result depending on
 * the kind of locale:
 *
 *  - For LC_GLOBAL_LOCALE the string must outlive the locale's use as the
 *    global locale.  The specification suggests a thread-local buffer and
 *    warns that it may go away when the thread exits or when another
 *    LC_GLOBAL_LOCALE query is made.  Since a locale installed by
 *    setlocale() is never freed (see port/locale/setlocale.c), the name
 *    stored in the locale can be returned directly.
 *
 *  - For any other locale the string may be invalidated by a call to
 *    uselocale() or newlocale().
 *
 * Either way it is enough to return the string held in the object itself.
 * Note that POSIX says an application must not assume the name stays the
 * same from one call to the next, so it is not suitable for serialization;
 * it can, however, be passed back to setlocale().
 */
const char *
getlocalename_l(int category, locale_t loc)
{
        if (loc == NULL)
                return (NULL);

        if (category == LC_ALL)
                return (loc->locname);

        if (category == LC_CTYPE || category == LC_NUMERIC ||
            category == LC_TIME || category == LC_COLLATE ||
            category == LC_MONETARY || category == LC_MESSAGES) {
                return (loc->locdata[category]->l_lname);
        }

        return (NULL);
}