root/regress/lib/libcrypto/utf8/utf8test.c
/*      $OpenBSD: utf8test.c,v 1.5 2022/11/26 16:08:56 tb Exp $ */
/*
 * Copyright (c) 2014 Philip Guenther <guenther@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * A mostly exhaustive test of UTF-8 decoder and encoder
 */

#include <stdio.h>
#include <string.h>
#include <err.h>

#include <openssl/asn1.h>
#include "asn1_local.h"         /* peek into the internals */

#define UNCHANGED       0xfedcba98

#define ASSERT(x)                                               \
        do {                                                    \
                if (!(x))                                       \
                        errx(1, "test failed at line %d: %s",   \
                            __LINE__, #x);                      \
        } while (0)

int
main(void)
{
        unsigned char testbuf[] = "012345";
        const unsigned char zerobuf[sizeof testbuf] = { 0 };
        unsigned long value;
        unsigned int i, j, k, l;
        int ret;

        /*
         * First, verify UTF8_getc()
         */
        value = UNCHANGED;
        ret = UTF8_getc(testbuf, 0, &value);
        ASSERT(ret == 0);
        ASSERT(value == UNCHANGED);

        /* check all valid single-byte chars */
        for (i = 0; i < 0x80; i++) {
                testbuf[0] = i;
                ret = UTF8_getc(testbuf, 1, &value);
                ASSERT(ret == 1);
                ASSERT(value == i);

                ret = UTF8_getc(testbuf, 2, &value);
                ASSERT(ret == 1);
                ASSERT(value == i);
        }

        /*
         * Verify failure on all invalid initial bytes:
         *      0x80 - 0xBF     following bytes only
         *      0xC0 - 0xC1     used to be in non-shortest forms
         *      0xF5 - 0xFD     used to be initial for 5 and 6 byte sequences
         *      0xFE - 0xFF     have never been valid in utf-8
         */
        for (i = 0x80; i < 0xC2; i++) {
                value = UNCHANGED;
                testbuf[0] = i;
                ret = UTF8_getc(testbuf, 1, &value);
                ASSERT(ret == -2);
                ASSERT(value == UNCHANGED);
        }
        for (i = 0xF5; i < 0x100; i++) {
                value = UNCHANGED;
                testbuf[0] = i;
                ret = UTF8_getc(testbuf, 1, &value);
                ASSERT(ret == -2);
                ASSERT(value == UNCHANGED);
        }

        /*
         * Verify handling of all two-byte sequences
         */
        for (i = 0xC2; i < 0xE0; i++) {
                testbuf[0] = i;

                for (j = 0; j < 0x100; j++) {
                        testbuf[1] = j;

                        value = UNCHANGED;
                        ret = UTF8_getc(testbuf, 1, &value);
                        ASSERT(ret == -1);
                        ASSERT(value == UNCHANGED);

                        ret = UTF8_getc(testbuf, 2, &value);

                        /* outside range of trailing bytes */
                        if (j < 0x80 || j > 0xBF) {
                                ASSERT(ret == -3);
                                ASSERT(value == UNCHANGED);
                                continue;
                        }

                        /* valid */
                        ASSERT(ret == 2);
                        ASSERT((value & 0x3F) == (j & 0x3F));
                        ASSERT(value >> 6 == (i & 0x1F));
                }
        }

        /*
         * Verify handling of all three-byte sequences
         */
        for (i = 0xE0; i < 0xF0; i++) {
                testbuf[0] = i;

                for (j = 0; j < 0x100; j++) {
                        testbuf[1] = j;

                        for (k = 0; k < 0x100; k++) {
                                testbuf[2] = k;

                                value = UNCHANGED;
                                ret = UTF8_getc(testbuf, 2, &value);
                                ASSERT(ret == -1);
                                ASSERT(value == UNCHANGED);

                                ret = UTF8_getc(testbuf, 3, &value);

                                /* outside range of trailing bytes */
                                if (j < 0x80 || j > 0xBF ||
                                    k < 0x80 || k > 0xBF) {
                                        ASSERT(ret == -3);
                                        ASSERT(value == UNCHANGED);
                                        continue;
                                }

                                /* non-shortest form */
                                if (i == 0xE0 && j < 0xA0) {
                                        ASSERT(ret == -4);
                                        ASSERT(value == UNCHANGED);
                                        continue;
                                }

                                /* surrogate pair code point */
                                if (i == 0xED && j > 0x9F) {
                                        ASSERT(ret == -2);
                                        ASSERT(value == UNCHANGED);
                                        continue;
                                }

                                ASSERT(ret == 3);
                                ASSERT((value & 0x3F) == (k & 0x3F));
                                ASSERT(((value >> 6) & 0x3F) == (j & 0x3F));
                                ASSERT(value >> 12 == (i & 0x0F));
                        }
                }
        }

        /*
         * Verify handling of all four-byte sequences
         */
        for (i = 0xF0; i < 0xF5; i++) {
                testbuf[0] = i;

                for (j = 0; j < 0x100; j++) {
                        testbuf[1] = j;

                        for (k = 0; k < 0x100; k++) {
                                testbuf[2] = k;

                                for (l = 0; l < 0x100; l++) {
                                        testbuf[3] = l;

                                        value = UNCHANGED;
                                        ret = UTF8_getc(testbuf, 3, &value);
                                        ASSERT(ret == -1);
                                        ASSERT(value == UNCHANGED);

                                        ret = UTF8_getc(testbuf, 4, &value);

                                        /* outside range of trailing bytes */
                                        if (j < 0x80 || j > 0xBF ||
                                            k < 0x80 || k > 0xBF ||
                                            l < 0x80 || l > 0xBF) {
                                                ASSERT(ret == -3);
                                                ASSERT(value == UNCHANGED);
                                                continue;
                                        }

                                        /* non-shortest form */
                                        if (i == 0xF0 && j < 0x90) {
                                                ASSERT(ret == -4);
                                                ASSERT(value == UNCHANGED);
                                                continue;
                                        }

                                        /* beyond end of UCS range */
                                        if (i == 0xF4 && j > 0x8F) {
                                                ASSERT(ret == -2);
                                                ASSERT(value == UNCHANGED);
                                                continue;
                                        }

                                        ASSERT(ret == 4);
                                        ASSERT((value & 0x3F) == (l & 0x3F));
                                        ASSERT(((value >> 6) & 0x3F) ==
                                                          (k & 0x3F));
                                        ASSERT(((value >> 12) & 0x3F) ==
                                                           (j & 0x3F));
                                        ASSERT(value >> 18 == (i & 0x07));
                                }
                        }
                }
        }


        /*
         * Next, verify UTF8_putc()
         */
        memset(testbuf, 0, sizeof testbuf);

        /* single-byte sequences */
        for (i = 0; i < 0x80; i++) {
                ret = UTF8_putc(NULL, 0, i);
                ASSERT(ret == 1);

                testbuf[0] = 0;
                ret = UTF8_putc(testbuf, 0, i);
                ASSERT(ret == -1);
                ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);

                ret = UTF8_putc(testbuf, 1, i);
                ASSERT(ret == 1);
                ASSERT(testbuf[0] == i);
                ASSERT(memcmp(testbuf+1, zerobuf, sizeof(testbuf)-1) == 0);
        }

        /* two-byte sequences */
        for (i = 0x80; i < 0x800; i++) {
                ret = UTF8_putc(NULL, 0, i);
                ASSERT(ret == 2);

                testbuf[0] = testbuf[1] = 0;
                ret = UTF8_putc(testbuf, 1, i);
                ASSERT(ret == -1);
                ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);

                ret = UTF8_putc(testbuf, 2, i);
                ASSERT(ret == 2);
                ASSERT(memcmp(testbuf+2, zerobuf, sizeof(testbuf)-2) == 0);
                ret = UTF8_getc(testbuf, 2, &value);
                ASSERT(ret == 2);
                ASSERT(value == i);
        }

        /* three-byte sequences */
        for (i = 0x800; i < 0x10000; i++) {
                if (i >= 0xD800 && i < 0xE000) {
                        /* surrogates aren't valid */
                        ret = UTF8_putc(NULL, 0, i);
                        ASSERT(ret == -2);
                        continue;
                }

                ret = UTF8_putc(NULL, 0, i);
                ASSERT(ret == 3);

                testbuf[0] = testbuf[1] = testbuf[2] = 0;
                ret = UTF8_putc(testbuf, 2, i);
                ASSERT(ret == -1);
                ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);

                ret = UTF8_putc(testbuf, 3, i);
                ASSERT(ret == 3);
                ASSERT(memcmp(testbuf+3, zerobuf, sizeof(testbuf)-3) == 0);
                ret = UTF8_getc(testbuf, 3, &value);
                ASSERT(ret == 3);
                ASSERT(value == i);
        }

        /* four-byte sequences */
        for (i = 0x10000; i < 0x110000; i++) {
                ret = UTF8_putc(NULL, 0, i);
                ASSERT(ret == 4);

                testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0;
                ret = UTF8_putc(testbuf, 3, i);
                ASSERT(ret == -1);
                ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);

                ret = UTF8_putc(testbuf, 4, i);
                ASSERT(ret == 4);
                ASSERT(memcmp(testbuf+4, zerobuf, sizeof(testbuf)-4) == 0);
                ret = UTF8_getc(testbuf, 4, &value);
                ASSERT(ret == 4);
                ASSERT(value == i);
        }

        /* spot check some larger values to confirm error return */
        for (i = 0x110000; i < 0x110100; i++) {
                ret = UTF8_putc(NULL, 0, i);
                ASSERT(ret == -2);
        }
        for (value = (unsigned long)-1; value > (unsigned long)-256; value--) {
                ret = UTF8_putc(NULL, 0, value);
                ASSERT(ret == -2);
        }

        return 0;
}