root/usr/src/lib/iconv_modules/utf-8/common/common_defs.h
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef COMMON_DEFS_H
#define COMMON_DEFS_H



/*
 * NOTE(review): the purpose of this constant is not evident from this
 * header; presumably it is a sanity-check tag used by the converters that
 * include this file - confirm against the users of MAGIC_NUMBER.
 */
#define MAGIC_NUMBER                    201513


/*
 * ISO/IEC 10646-1/Unicode Byte Order Mark
 *
 * ICV_BOM_IN_BIG_ENDIAN is U+FEFF itself. The little-endian values are the
 * same code with its bytes swapped: 0xfffe as a two-byte unit and
 * 0xfffe0000 as a four-byte unit. ICV_BOM_IN_LITTLE_ENDIAN takes the
 * two-byte form when one of the UCS-2/UTF-16 encoding macros is defined
 * and the four-byte form otherwise; ICV_BOM_IN_LITTLE_ENDIAN_UCS4 is always
 * the four-byte form.
 */
#define ICV_BOM_IN_BIG_ENDIAN           0x00feff
#define ICV_BOM_IN_LITTLE_ENDIAN_UCS4   0xfffe0000
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
        defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define ICV_BOM_IN_LITTLE_ENDIAN        0x00fffe
#else
#define ICV_BOM_IN_LITTLE_ENDIAN        0xfffe0000
#endif


/*
 * The following type macros are the error cases that can be recorded in
 * the mapping tables. A valid character is recorded with its byte length,
 * which is always a positive integer, so the negative values below can
 * never be mistaken for a length.
 */
#define ICV_TYPE_NON_IDENTICAL_CHAR     (-1)
#define ICV_TYPE_ILLEGAL_CHAR           (-2)

/*
 * Following are replacement characters for non-identical character cases:
 * '?' for ASCII and U+FFFD for UTF-8 and UCS-2. The UTF-8 value holds the
 * three bytes EF BF BD that encode U+FFFD, packed into one integer.
 */
#define ICV_CHAR_ASCII_REPLACEMENT      ('?')
#define ICV_CHAR_UTF8_REPLACEMENT       (0x00efbfbd)
#define ICV_CHAR_UCS2_REPLACEMENT       (0xfffd)


/*
 * Boolean type used by the conversion state structures in this header.
 *
 * NOTE(review): the enumerators are named "false" and "true", which
 * <stdbool.h> defines as macros; this header presumably must not be
 * combined with <stdbool.h> - confirm against the including sources.
 */
typedef enum {
        false = 0,
        true = 1
} boolean;


/*
 * Entry type for tables that map to UTF-8.
 * We only support characters in range of UTF-16.
 *
 * NOTE(review): presumably "u8" holds the UTF-8 bytes of the character
 * packed into one integer (as ICV_CHAR_UTF8_REPLACEMENT does) and "size"
 * holds either the byte length or one of the negative ICV_TYPE_* codes -
 * confirm against the mapping tables that use this type.
 */
typedef struct {
        unsigned int    u8;
        signed char     size;
} to_utf8_table_component_t;

/*
 * Entry type pairing a value "u8" with a one-byte value "sb".
 *
 * NOTE(review): presumably "u8" is a UTF-8 character packed into one
 * integer and "sb" is the matching single-byte code - confirm against the
 * mapping tables that use this type.
 */
typedef struct {
        unsigned int    u8;
        unsigned char   sb;
} to_sb_table_component_t;


/*
 * UCS-2/UCS-4/UTF-16/UTF-32 requires state management.
 *
 * NOTE(review): judging by the field names, "bom_written" records whether
 * the Byte Order Mark has already been handled for the stream and
 * "little_endian" records the byte order in effect - confirm against the
 * converters that use this state.
 */
typedef struct {
        boolean         bom_written;
        boolean         little_endian;
} ucs_state_t;

/*
 * Pair of ucs_state_t values, one named for the input side and one for
 * the output side; presumably used when both ends of a conversion are
 * UCS encodings - confirm against the converters that use this state.
 */
typedef struct {
        ucs_state_t     input;
        ucs_state_t     output;
} ucs_ucs_state_t;


/*
 * UTF-7 requires additional state data fields.
 *
 * The first two fields mirror ucs_state_t. "remnant_count" is a count of
 * bits, not bytes.
 *
 * NOTE(review): presumably "remnant" keeps the leftover bits of a
 * partially processed UTF-7 sequence, "remnant_count" says how many of
 * them are valid and "prevch" is the previously seen character - confirm
 * against the UTF-7 converters.
 */
typedef struct {
        boolean         bom_written;
        boolean         little_endian;
        boolean         in_the_middle_of_utf7_sequence;
        unsigned int    remnant;
        signed char     remnant_count;          /* in bits */
        unsigned char   prevch;
} utf7_state_t;


/*
 * Following vector shows the number of bytes in a UTF-8 character.
 * Index will be the first byte of the character. Bytes that cannot start
 * a character (0x80 - 0xC1 and 0xF5 - 0xFF) hold ICV_TYPE_ILLEGAL_CHAR.
 *
 * The element type is explicitly "signed char": ICV_TYPE_ILLEGAL_CHAR is
 * negative and the signedness of plain "char" is implementation-defined.
 * With plain "char" the illegal entries would read back as a large
 * positive value on platforms where "char" is unsigned and would no
 * longer compare equal to ICV_TYPE_ILLEGAL_CHAR.
 */

#define IL_                             ICV_TYPE_ILLEGAL_CHAR

static const signed char number_of_bytes_in_utf8_char[0x100] = {
    /*  00 - 7F: single-byte (ASCII) characters  */
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

    /*  80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
        IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
        IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
        IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
        IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
        IL_,IL_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

    /*  D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

    /*  E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

    /*  F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
         4,  4,  4,  4,  4, IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,
};

#undef IL_

/*
 * Bit-masks that keep only the used bits of the first byte of a UTF-8
 * character. The table is indexed by the number of bytes in the character,
 * i.e. by the value looked up in number_of_bytes_in_utf8_char.
 */
static const char masks_tbl[7] = {
        0x00,   /* 0: no character */
        0x7f,   /* 1 byte:  0xxx xxxx */
        0x1f,   /* 2 bytes: 110x xxxx */
        0x0f,   /* 3 bytes: 1110 xxxx */
        0x07,   /* 4 bytes: 1111 0xxx */
        0x03,   /* 5 bytes: 1111 10xx */
        0x01    /* 6 bytes: 1111 110x */
};

/*
 * Smallest valid value of the second byte of a multibyte UTF-8 character,
 * indexed by the value of the character's first byte. Together with
 * valid_max_2nd_byte it allows stricter illegal sequence checking than a
 * plain 0x80 - 0xbf test: E0 starts at 0xa0 and F0 at 0x90, which rejects
 * overlong three- and four-byte forms.
 *
 * First bytes that do not start a multibyte character (00 - C1 and
 * F5 - FF) have the entry 0.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 00 - 0F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 10 - 1F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 20 - 2F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 30 - 3F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 40 - 4F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 50 - 5F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 60 - 6F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 70 - 7F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 80 - 8F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 90 - 9F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* A0 - AF */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* B0 - BF */

        0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,    /* C0 - C7 */
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,    /* C8 - CF */
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,    /* D0 - D7 */
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,    /* D8 - DF */
        0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,    /* E0 - E7 */
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,    /* E8 - EF */
        0x90, 0x80, 0x80, 0x80, 0x80, 0x00, 0x00, 0x00,    /* F0 - F7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,    /* F8 - FF */
};

/*
 * Largest valid value of the second byte of a multibyte UTF-8 character,
 * indexed by the value of the character's first byte. It is the upper
 * bound matching valid_min_2nd_byte: ED stops at 0x9f, which rejects the
 * surrogate range that starts at ED A0 80, and F4 stops at 0x8f, which
 * rejects everything above F4 8F BF BF.
 *
 * First bytes that do not start a multibyte character (00 - C1 and
 * F5 - FF) have the entry 0.
 */
static const unsigned char valid_max_2nd_byte[0x100] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 00 - 0F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 10 - 1F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 20 - 2F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 30 - 3F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 40 - 4F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 50 - 5F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 60 - 6F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 70 - 7F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 80 - 8F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 90 - 9F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* A0 - AF */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* B0 - BF */

        0x00, 0x00, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,    /* C0 - C7 */
        0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,    /* C8 - CF */
        0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,    /* D0 - D7 */
        0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,    /* D8 - DF */
        0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,    /* E0 - E7 */
        0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,    /* E8 - EF */
        0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0x00, 0x00, 0x00,    /* F0 - F7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,    /* F8 - FF */
};


/*
 * Following "6" and "0x3f" came from 10xx xxxx bit representation of UTF-8
 * characters' second to sixth bytes: each of those bytes carries six
 * payload bits, selected by the mask 0x3f.
 *
 * NOTE(review): the UTF-8 encoded Byte Order Mark (EF BB BF) is three
 * bytes long, so the value 6 of ICV_FETCH_UTF8_BOM_SIZE looks surprising -
 * confirm against the code that uses it before relying on it as a length.
 */
#define ICV_UTF8_BIT_SHIFT              6
#define ICV_UTF8_BIT_MASK               0x3f
#define ICV_FETCH_UTF8_BOM_SIZE         6

/*
 * Fetch sizes, presumably in bytes, for the UCS encodings.
 * ICV_FETCH_UCS4_SIZE is always 4. ICV_FETCH_UCS_SIZE and
 * ICV_FETCH_UCS_SIZE_TWO (twice ICV_FETCH_UCS_SIZE) depend on the encoding
 * macro defined by the including source: 2 and 4 for UCS-2/UTF-16, 4 and 8
 * for UCS-4/UTF-32. When none of the listed encoding macros is defined,
 * these two macros are left undefined.
 */
#define ICV_FETCH_UCS4_SIZE             4
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
        defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define ICV_FETCH_UCS_SIZE              2
#define ICV_FETCH_UCS_SIZE_TWO          4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
        defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
#define ICV_FETCH_UCS_SIZE              4
#define ICV_FETCH_UCS_SIZE_TWO          8
#endif

/*
 * UTF-8 representations of some useful Unicode values, each packed into
 * one integer with the first byte in the most significant position.
 *
 * U+FFFE in UTF-8 is 0x00efbfbe and U+FFFF is 0x00efbfbf, but the values
 * defined below for them are already masked with
 * ICV_UTF8_REPRESENTATION_ffff_mask, i.e. only the low 20 bits are kept.
 * NOTE(review): presumably this lets one masked comparison also catch the
 * xFFFE/xFFFF values of the other planes - confirm against the callers.
 */
#define ICV_UTF8_REPRESENTATION_d800            (0x00eda080UL)
#define ICV_UTF8_REPRESENTATION_dfff            (0x00edbfbfUL)
#define ICV_UTF8_REPRESENTATION_fdd0            (0x00efb790UL)
#define ICV_UTF8_REPRESENTATION_fdef            (0x00efb7afUL)

#define ICV_UTF8_REPRESENTATION_fffe            (0x000fbfbeUL)
#define ICV_UTF8_REPRESENTATION_ffff            (0x000fbfbfUL)
#define ICV_UTF8_REPRESENTATION_ffff_mask       (0x000fffffUL)

#define ICV_UTF8_REPRESENTATION_10fffd          (0xf48fbfbdUL)

/*
 * UTF-32 and UCS-4 representations of some useful Unicode values for
 * non-character and out of bound invalid character detection:
 *
 *  - 0xfffe and 0xffff, with a mask covering the low 16 bits of a
 *    character value;
 *  - the first and last values of the surrogate range, 0xd800 - 0xdfff;
 *  - the first and last values of the range 0xfdd0 - 0xfdef;
 *  - the last character value accepted for UTF-32 (0x10fffd) and for
 *    UCS-4 (0x7fffffff).
 */
#define ICV_UTF32_NONCHAR_fffe                  (0xfffeU)
#define ICV_UTF32_NONCHAR_ffff                  (0xffffU)
#define ICV_UTF32_NONCHAR_mask                  (0xffffU)

#define ICV_UTF32_SURROGATE_START_d800          (0xd800U)
#define ICV_UTF32_SURROGATE_END_dfff            (0xdfffU)

#define ICV_UTF32_ARABIC_NONCHAR_START_fdd0     (0xfdd0U)
#define ICV_UTF32_ARABIC_NONCHAR_END_fdef       (0xfdefU)

#define ICV_UTF32_LAST_VALID_CHAR               (0x10fffdU)

#define ICV_UCS4_LAST_VALID_CHAR                (0x7fffffff)


#endif  /* COMMON_DEFS_H */