root/usr/src/lib/iconv_modules/zh/common/zh_TW-big5%zh_TW-iso2022-CN-EXT.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1997, by Sun Microsystems, Inc.
 * All rights reserved.
 */


/*
   Converts From:       Taiwanese BIG5 encoding
   Converts To:         ISO2022-CN-EXT encoding.

   NOTE: This file was created using vi editor with tabstop set to 4.
                 To view this file correctly set tabstop appropriately.
                 e.g. for vi use command        ESC:se ts=4
 */

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include "big5_cns11643.h"      /* Big5 to CNS 11643 mapping table */

#define MSB                     0x80    /* High bit; set on the first byte of a BIG5 character */
#define ONEBYTE         0xff    /* Mask that keeps only the low-order byte */

#define SI              0x0f    /* Shift In: output is back in ASCII */
#define SO              0x0e    /* Shift Out: output uses the SO designated set */
#define SS2             0x4e    /* Second byte of the SS2 single shift; the first is ESC */
#define SS3             0x4f    /* Second byte of the SS3 single shift; the first is ESC */
#define ESC             0x1b    /* The Escape character */
#define NON_ID_CHAR     '_' /* Written in place of every unidentified character */

/* GET_PLANEC() - Gets the final character of the designation escape
                  sequence for a CNS11643 plane. The plane number is used
                  directly as the index into plane_char, so plane 1 gives
                  'G', plane 2 gives 'H' and so on up to plane 16 ('V').
                  Entry 0 only pads the table so that plane numbers can
                  index it directly */
static const char plane_char[] = "0GHIJKLMNOPQRSTUV";
#define GET_PLANEC(i)   (plane_char[(i)])

/* Conversion state kept between calls; one is allocated per _icv_open() */
typedef struct _icv_state {
        char    keepc[2];       /* The bytes of the BIG5 character being
                                   assembled: [0] is saved in state C0 and
                                   [1] in state C1 */
        short   cstate;         /* Current state of the input state machine,
                                   either C0 or C1 */
        char    ishiftfunc;     /* The shift function, SI or SO, currently
                                   active in the output ISO buffer */
        int     iSOplane;       /* The CNS11643 plane currently assigned to
                                   the SOdesignation in the output ISO
                                   buffer, or -1 when none has been output.
                                   Only CNS11643 plane 1 is ever assigned
                                   to SOdesignation */
        int     iSS2plane;      /* The CNS11643 plane currently assigned to
                                   the SS2designation in the output ISO
                                   buffer, or -1 when none has been output.
                                   Only CNS11643 plane 2 is ever assigned
                                   to SS2designation */
        int     iSS3plane;      /* The CNS11643 plane currently assigned to
                                   the SS3designation in the output ISO
                                   buffer, or -1 when none has been output.
                                   Every plane that big5_to_iso() accepts,
                                   other than 1 and 2, is assigned to
                                   SS3designation */
        size_t  nonidcount; /* Number of input bytes that were skipped or
                               replaced by NON_ID_CHAR; cleared only by a
                               reset request */
        int     _errno;         /* Error recorded during the current
                                   _icv_iconv() call, 0 if none */
} _iconv_st;

/* States of the input state machine: in C0 the next byte is either a single
   byte character or the first byte of a BIG5 character, in C1 the next byte
   is the second byte of a BIG5 character */
enum _CSTATE    { C0, C1 };

/* Helpers private to this file; all of them are defined further below */
static int isbig5(unsigned char*);
static int hascns(char*);
static int ascii_to_iso(char, _iconv_st*, char**, size_t*);
static int big5_to_iso(int, _iconv_st*, char**, size_t*);
static int getcnsbytes(int, char*, int*);
static int binsearch(unsigned long, table_t[], int);


/*
 * _icv_open: Called from iconv_open(). Allocates one _iconv_st and puts it
 *            in the initial conversion state: state machine in C0, SI
 *            active, no plane designated, nothing counted and no error.
 *            Returns the structure as (void *). If the allocation fails,
 *            errno is set to ENOMEM and (void *)-1 is returned.
 */


void *
_icv_open()
{
        _iconv_st  *state;

#ifdef DEBUG
        fprintf(stderr, "_icv_open(): Come into!\n");
#endif

        state = malloc(sizeof (*state));
        if (state == NULL) {
                errno = ENOMEM;
#ifdef DEBUG
                fprintf(stderr, "Error\n");
#endif
                return ((void *) -1);
        }

        /* -1 marks a designation that has not been output yet */
        state->iSOplane = state->iSS2plane = state->iSS3plane = -1;
        state->ishiftfunc = SI;
        state->cstate = C0;
        state->nonidcount = 0;
        state->_errno = 0;

#ifdef DEBUG
        fprintf(stderr, "====== _icv_open(): Big5 --> ISO2022-CN-EXT =====\n");
#endif

        return ((void *) state);
}



/*
 * _icv_close: Called from iconv_close(). Releases the _iconv_st structure
 *             passed in. A NULL pointer is not freed; errno is set to
 *             EBADF instead.
 */

void
_icv_close(_iconv_st *st)
{
        if (st != NULL) {
                free(st);
                return;
        }

        errno = EBADF;
}

/*
 * _icv_iconv: Called from iconv(). Does the conversion from BIG5 to
 *             ISO2022-CN-EXT.
 *
 * Reset request: when inbuf, *inbuf or inbytesleft is NULL, or *inbytesleft
 * is 0, the state is put back to its initial value. If SO is active an SI
 * is written first; when there is no room for it the call fails with E2BIG.
 *
 * Conversion: input bytes are consumed one at a time while input remains
 * and the output buffer is not full. Bytes without the high bit are copied
 * (ESC, SI and SO are replaced by NON_ID_CHAR because they are meaningful
 * in the output encoding). A byte with the high bit set is kept and paired
 * with the following byte; a valid pair is converted through the Big5 to
 * CNS table, and a valid pair that has no CNS code becomes two NON_ID_CHARs.
 *
 * Returns (size_t)-1 with errno set to EBADF (st is NULL) or E2BIG (output
 * buffer missing or too small). Otherwise returns *inbytesleft plus
 * st->nonidcount. nonidcount is cleared only by a reset request, so it
 * carries over from one call to the next.
 *
 * NOTE(review): when an invalid pair is seen, errno is set to EILSEQ and
 * the loop stops after both bytes have been consumed, but the value
 * returned is the count described above, not (size_t)-1. Callers have to
 * look at errno to notice it. Left as is; confirm whether this is intended.
 */
/*=======================================================
 *
 *   State Machine for interpreting Big-5 code
 *
 *=======================================================
 *
 *                     1st C
 *    +--------> C0 ----------> C1
 *    |    ascii |        2nd C |
 *    ^          v              v
 *    +----<-----+-----<--------+
 *
 *=======================================================*/
size_t
_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
                                char **outbuf, size_t *outbytesleft)
{

        int n, idx;
        size_t need;    /* Output bytes required by a substitution */

#ifdef DEBUG
    fprintf(stderr, "=== _icv_iconv(): Big5 --> ISO2022-CN-EXT =====\n");
#endif

        if (st == NULL) {
            errno = EBADF;
            return ((size_t) -1);
        }

        if (inbuf == NULL || *inbuf == NULL ||
            inbytesleft == NULL || *inbytesleft == 0) { /* Reset request */
            if (st->ishiftfunc == SO) {
                /* Leave the output in ASCII before forgetting the state */
                if (outbytesleft && *outbytesleft >= 1  && outbuf && *outbuf) {
                    **outbuf = SI;
                    (*outbuf)++;
                    (*outbytesleft)--;
                } else {
                    errno = E2BIG;
                    return((size_t) -1);
                }
            }
            st->cstate = C0;
            st->ishiftfunc = SI;
            st->iSOplane = -1;
            st->iSS2plane = -1;
            st->iSS3plane = -1;
            st->nonidcount = 0;
            st->_errno = 0;
            return ((size_t) 0);
        }

        st->_errno = 0;
        errno = 0;

        /* inbytesleft was checked above. Here we confirm that outbytesleft,
        outbuf and *outbuf are non-NULL and that there is at least one byte
        of output space, so that the conversion sub-routines below do not
        have to worry about NULL pointers. An output buffer without any
        space is reported as E2BIG */

        if (outbytesleft == NULL || *outbytesleft == 0 ||
                outbuf == NULL || *outbuf == NULL){
            errno = E2BIG;
            return ((size_t)-1);
        }

        /* A state machine for interpreting Big-5 code */
        while (*inbytesleft > 0 && *outbytesleft > 0) {
            switch (st->cstate) {
            case C0:
                if (**inbuf & MSB) { /* May have got the first byte of a BIG5 code */

                    st->keepc[0] = **inbuf;             /*Save byte */
                    st->cstate = C1;    /* Go to the next state where
                                           the next BIG5 byte is received */
                    st->nonidcount += 1;/* Until we have verified that this and
                                           the next byte make a valid BIG5 code
                                           we shall consider this as an
                                           unidentified byte */
                } else if (**inbuf == ESC || **inbuf == SI || **inbuf == SO){

                    /* We should not process these ASCII control codes as these
                       have special significance in the output ISO encoding.
                       Instead we will output NON_ID_CHAR and continue processing */

                    n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
                    if (n < 0) /* Insufficient space in the outbuf */
                            return ((size_t)-1); /* The errno etc. are set in ascii_to_iso */
                    st->nonidcount += 1;
                } else { /* Got ASCII code */
                    n = ascii_to_iso(**inbuf, st, outbuf, outbytesleft);
                    if (n < 0) /* Insufficient space in the outbuf */
                        return ((size_t)-1);
                }
                break;

            case C1:
                st->keepc[1] = (**inbuf);
                if (isbig5((unsigned char*) st->keepc) == 0) {
                    if ((idx = hascns(st->keepc)) >= 0){
                        n = big5_to_iso(idx, st, outbuf, outbytesleft);
                        if (n < 0) /* Insufficient space in the outbuf */
                            return ((size_t)-1);
                        st->nonidcount -= 1; /* The first byte of this big5 saved in
                                                state C0 is confirmed valid BIG5 High
                                                byte and is processed correctly */

                    } else { /* Valid BIG5 but has no CNS encoding */
                        /* Two NON_ID_CHAR characters replace the two input
                           bytes, preceded by SI when SO is active. Make sure
                           all of it fits before anything is written: this
                           keeps the second write inside the buffer and keeps
                           a retry after E2BIG from repeating the first
                           NON_ID_CHAR */
                        need = 2;
                        if (st->ishiftfunc != SI)
                            need += 1;
                        if (*outbytesleft < need) {
                            st->_errno = errno = E2BIG;
                            return ((size_t)-1);
                        }

                        n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
                        if (n < 0) /* Insufficient space in the outbuf */
                            return ((size_t)-1);
                        n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
                        if (n < 0) /* Insufficient space in the outbuf */
                            return ((size_t)-1);
                        st->nonidcount += 1; /* Include the 2nd byte also as
                                                    unidentified byte; the 1st
                                                    was counted in state C0 */
                    }
                } else { /* Input character is not BIG5 encoding */
                    st->nonidcount += 1;
                    st->_errno = errno = EILSEQ; /* This will cause the code to
                                                    break out of while loop below
                                                    to return to the caller */

                }
                st->cstate = C0; /* Go to the initial state */
                break;

            default:            /* Should never come here */
                fprintf(stderr,
         "_icv_iconv():Big5-->ISO2022-CN-EXT: Should not have come here\n");
                st->_errno = errno = EILSEQ;
                st->cstate = C0;
                break;

            } /* end switch */

            /* The byte has been dealt with; the E2BIG returns above leave it
               unconsumed so that it is seen again on the next call */
            (*inbuf)++;
            (*inbytesleft)--;

            if (st->_errno)
                    break; /* Break out of while loop */

            if (errno) /* We set st->_errno before we set errno. If errno is set
                                      somewhere else we handle that here */
                return ((size_t)-1);

        } /* end while */

/* We now have to handle the case where we have successfully processed the
   previous input character which exhausted the output buffer. This is handled
   by the while loop. However, since there are more input characters that
   haven't been processed yet, we need to set the errno appropriately and
   return -1. */
        if (*inbytesleft > 0 && *outbytesleft == 0) {
                errno = E2BIG;
                return ((size_t)-1);
        }

        return (*inbytesleft + st->nonidcount);

}


/*
 * isbig5() : checks whether the two bytes at twobytes lie inside the Big-5
 *            encoding range. Returns 0 when they do and -1 when they do not.
 *
 * Big-5 encoding range:
 *      High byte: 0xA1 - 0xFE                          (94 encoding space)
 *      Low byte:  0x40 - 0x7E, 0xA1 - 0xFE     (157 encoding space)
 *      Plane #1:  0xA140 - 0xC8FE                      (6280 encoding space)
 *      Plane #2:  0xC940 - 0xFEFE                      (8478 encoding space)
 *      Total:     94 * 157 = 14,758            (14758 encoding space)
 */
static int isbig5(unsigned char *twobytes)
{
        unsigned char low;

        /* The high byte decides first; the low byte is looked at only
           when the high byte is acceptable */
        if (twobytes[0] < 0xa1 || twobytes[0] > 0xfe)
            return (-1);

        low = twobytes[1];
        if (low >= 0x40 && low <= 0x7e)
            return (0);
        if (low >= 0xa1 && low <= 0xfe)
            return (0);

        return (-1);
}


/*
 * hascns() : looks up the two byte big5 character at big5mbchar in the
 *            big5 to CNS table. The two bytes are combined into a 16 bit
 *            key, first byte in the high half. Returns the index of the
 *            matching table entry, or -1 when the table has no entry for
 *            the character.
 */
static int hascns(char* big5mbchar)
{
        /* Mask each byte so that a negative char does not sign extend */
        unsigned long lead = (unsigned long) (big5mbchar[0] & ONEBYTE);
        unsigned long trail = (unsigned long) (big5mbchar[1] & ONEBYTE);

        /* binsearch gives the index of the entry, or -1 if there is none */
        return (binsearch((lead << 8) | trail, big5_cns_tab, MAX_BIG5_NUM));
}


/* ascii_to_iso() : Outputs the single byte character c. If SO is active in
 *                  the output, the SI shift function is output first and
 *                  recorded in st->ishiftfunc. When c is a newline, the
 *                  three plane designations are reset to unknown (-1).
 *
 *                  Returns 0 on success. If the output buffer has no room,
 *                  either on entry or for c after SI has been written,
 *                  st->_errno and errno are set to E2BIG and -1 is
 *                  returned. An SI that was already written stays in the
 *                  buffer and in the state, so a later call outputs c only.
 */
static int ascii_to_iso(char c, _iconv_st *st, char **outbuf,
                                                        size_t *outbytesleft)
{
        /* Never write into a full buffer. Without this check a call made
           while SI is active and no space is left would store past the end
           of the buffer and wrap *outbytesleft around */
        if (*outbytesleft < 1){
            st->_errno = errno = E2BIG;
            return (-1);
        }

        if (st->ishiftfunc != SI){
            **outbuf = SI;
            (*outbuf)++;
            (*outbytesleft)--;
            st->ishiftfunc = SI;

            if (*outbytesleft < 1){ /* Do we now have space for ASCII character?*/
                    st->_errno = errno = E2BIG;
                    return (-1);
            }
        }

        **outbuf = c;
        (*outbuf)++;
        (*outbytesleft)--;

        /* Each line in ISO is expected to have the character set information
           for the Chinese characters in that line. This facilitates text
           scrolling. Hence, on encountering newline reset designations to
           unknown */
        if (c == '\n'){
            st->iSOplane = -1;
            st->iSS2plane = -1;
            st->iSS3plane = -1;
        }

        return (0);

}



/* iso_out_full() : Records that the output buffer is too small, in both
 *                  st->_errno and errno, and returns -1 so that callers
 *                  can return its result directly.
 */
static int iso_out_full(_iconv_st *st)
{
        st->_errno = errno = E2BIG;
        return (-1);
}

/* iso_designate() : Outputs the four byte designation ESC '$' inter final,
 *                   where final is the plane character of plane. Returns 0
 *                   on success. With fewer than 4 bytes of space nothing
 *                   is written and -1 is returned with E2BIG flagged.
 */
static int iso_designate(_iconv_st *st, char **outbuf, size_t *outbytesleft,
                                                        char inter, int plane)
{
        char *out;

        if (*outbytesleft < 4)
            return (iso_out_full(st));

        out = *outbuf;
        out[0] = ESC;
        out[1] = '$';
        out[2] = inter;
        out[3] = GET_PLANEC(plane);

        *outbuf = out + 4;
        *outbytesleft -= 4;
        return (0);
}

/* iso_single_shift() : Outputs the two byte single shift ESC shift. This is
 *                      done only when there is space for the shift and for
 *                      the two code bytes that follow it, i.e. 4 bytes.
 *                      Returns 0 on success, else -1 with E2BIG flagged.
 */
static int iso_single_shift(_iconv_st *st, char **outbuf,
                                        size_t *outbytesleft, char shift)
{
        if (*outbytesleft < 4)
            return (iso_out_full(st));

        (*outbuf)[0] = ESC;
        (*outbuf)[1] = shift;
        *outbuf += 2;
        *outbytesleft -= 2;
        return (0);
}

/* big5_to_iso() : Converts the Big5 code, for which the index idx in
 *                 the big5 to cns table is provided as an argument, to
 *                 its corresponding ISO2022-CN-EXT code.
 *
 *                 Plane 1 goes out through SO: the SO designation is
 *                 output when it is not already plane 1, then SO itself
 *                 when it is not already active. Plane 2 goes out through
 *                 SS2 and planes 3-7, 12 and 14-16 through SS3: the
 *                 designation is output when it differs from the one in
 *                 effect, and the single shift is output every time.
 *                 The two cns code bytes come last.
 *
 *                 Returns -1 with st->_errno and errno set to E2BIG when
 *                 the outbuf has insufficient space. Whatever had been
 *                 output by then is remembered in st, so it is not output
 *                 again by a later call. Returns 0 on success, and also
 *                 0, with EILSEQ flagged and a message on stderr, when
 *                 the cns code or its plane is not usable.
 */
static int big5_to_iso(int idx, _iconv_st *st, char **outbuf,
                                                        size_t *outbytesleft)
{
        char code[2];
        int plane;

        if (getcnsbytes(idx, code, &plane) < 0){
            /* hascns() should have filtered this out, so we are not
               expected to get here. Report it if we do */
            fprintf(stderr,
              "big5_to_iso():Big5->ISO2022-CN-EXT:gencnsbyte() rejected cnscode\n");
            st->_errno = errno = EILSEQ;
            return (0);
        }

        if (plane == 1){
            /* Designate plane 1 to SO unless that is already in effect */
            if (st->iSOplane != plane){
                if (iso_designate(st, outbuf, outbytesleft, ')', plane) < 0)
                    return (-1);
                st->iSOplane = plane;
            }

            /* Shift out unless SO is the active shift function */
            if (st->ishiftfunc != SO){
                if (*outbytesleft < 1)
                    return (iso_out_full(st));

                *(*outbuf)++ = SO;
                (*outbytesleft)--;
                st->ishiftfunc = SO;
            }
        } else if (plane == 2){
            /* Designate plane 2 to SS2 unless that is already in effect */
            if (st->iSS2plane != plane){
                if (iso_designate(st, outbuf, outbytesleft, '*', plane) < 0)
                    return (-1);
                st->iSS2plane = plane;
            }

            if (iso_single_shift(st, outbuf, outbytesleft, SS2) < 0)
                return (-1);
        } else if ((plane >= 3 && plane <= 7) || plane == 12 ||
                                        (plane >= 14 && plane <= 16)){
            /* Designate this plane to SS3 unless it is already in effect */
            if (st->iSS3plane != plane){
                if (iso_designate(st, outbuf, outbytesleft, '+', plane) < 0)
                    return (-1);
                st->iSS3plane = plane;
            }

            if (iso_single_shift(st, outbuf, outbytesleft, SS3) < 0)
                return (-1);
        } else {
            /* No shift is defined here for this plane. The caller is
               expected never to hand us such a code; report it if it does */
            fprintf(stderr, "big5_to_iso():Big5->ISO2022-CN-EXT:Rejecting cnscode\n");
            st->_errno = errno = EILSEQ;
            return (0);
        }

        /* Output the cns code */
        if (*outbytesleft < 2)
            return (iso_out_full(st));

        *(*outbuf)++ = code[0];
        *(*outbuf)++ = code[1];
        *outbytesleft -= 2;

        return (0);

}


/* getcnsbytes() : Splits the value stored at index idx of the big5 to cns
 *                 table into a plane number and a two byte code. Bits 16
 *                 and up of the value hold a plane tag in the range
 *                 0x21 - 0x30, the low 16 bits hold the code.
 *
 *                 On success stores the plane in *cnsplane and the code,
 *                 high byte first, in cnsbytes[0] and cnsbytes[1], and
 *                 returns 0. Returns -1, storing nothing, when the tag is
 *                 outside 0x21 - 0x30.
 */
static int getcnsbytes(int idx, char *cnsbytes, int *cnsplane)
{

        unsigned long entry = big5_cns_tab[idx].value;
        int tag = (int) (entry >> 16);

        if (tag < 0x21 || tag > 0x30)
            return (-1); /* Should not have happened */

        if (tag == 0x2e){
            /* Tag 0x2e (plane 14, 'T') is reported as plane 3.
               NOTE(review): presumably because CNS 11643-1992 carries
               these characters in plane 3 - confirm */
            *cnsplane = 3;
        } else {
            /* Tags 0x21 - 0x30 are planes 1 - 16 ('G' - 'V'), so that the
               result can be used with GET_PLANEC() */
            *cnsplane = tag - 0x20;
        }

        cnsbytes[0] = (entry >> 8) & 0xff;
        cnsbytes[1] = entry & 0xff;

        return (0);

}


/* binsearch: binary search for the key x in the n entries of v, which must
 *            be ordered so that v[0].key <= v[1].key <= ... <= v[n-1].key.
 *            Returns the index of an entry whose key equals x, or -1 when
 *            there is none (always the case when n is 0 or less).
 */
static int binsearch(unsigned long x, table_t v[], int n)
{
        int first = 0;
        int last = n - 1;

        /* first..last is the part of v that can still hold x */
        while (first <= last) {
            int probe = (first + last) / 2;

            if (x < v[probe].key) {
                last = probe - 1;
                continue;
            }
            if (x > v[probe].key) {
                first = probe + 1;
                continue;
            }
            return probe;       /* found match */
        }

        return (-1);    /* no match */
}