#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <errno.h>
#ifdef DEBUG
#include <sys/fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#include <public_struc.h>
#include <gb2312_unicode.h>
#include <cns11643_unicode_CN.h>
/* High bit of a byte. Not referenced in the visible code. */
#define MSB 0x80
/* Lead byte stored in _keepc[0] when a four-byte code is assembled (state G22). */
#define MBYTE 0x8e
/* Added to the plane number to form _keepc[1] of a four-byte code (state G22). */
#define PMASK 0xa0
/* Not referenced in the visible code. */
#define ONEBYTE 0xff
/* Mask that clears the high bit of a byte before a CNS table lookup. */
#define MSB_OFF 0x7f
/* Shift-In control byte: leaves double-byte mode in the state machine. */
#define SI 0x0f
/* Shift-Out control byte: enters double-byte mode in the state machine. */
#define SO 0x0e
/* Escape byte that introduces a designation or single-shift sequence. */
#define ESC 0x1b
/* Byte ('N') compared against the one following ESC in states G14 and G17. */
#define SS2 0x4e
/* Byte ('O') compared against the one following ESC in states G21 and G25. */
#define SS3 0x4f
/* The three bytes EF BF BD are the UTF-8 encoding of U+FFFD (replacement character). */
#define UTF8_NON_ID_CHAR1 0xEF
#define UTF8_NON_ID_CHAR2 0xBF
#define UTF8_NON_ID_CHAR3 0xBD
/*
 * Conversion state allocated by _icv_open() and carried across calls to
 * _icv_iconv().
 */
typedef struct _icv_state {
/* Escape-sequence bytes collected so far; emitted by flush_buf() when the
 * sequence turns out not to be one the state machine recognizes. */
char _buf[10];
/* Number of valid bytes currently held in _buf. */
size_t _bufcont;
/* Bytes of the multi-byte character being assembled for iso_to_utf8(). */
char _keepc[4];
/* Current state of the parser (an enum _GSTATE value). */
short _gstate;
/* IN or OUT (enum _ISTATE); set alongside SI/SO/single-shift handling. */
short _istate;
/* Plane selected by the most recent designation; -1 after _icv_open(). */
int _plane;
/* Plane saved so that a rejected designation can restore _plane. */
int _last_plane;
/* Error code recorded by the helpers; copied into errno by _icv_iconv(). */
int _errno;
} _iconv_st;
/*
 * Parser states stored in _iconv_st._gstate. _icv_iconv() has case labels
 * for G0 through G25; G26..G29 have no case label there and would reach its
 * default branch.
 */
enum _GSTATE { G0, G1, G2, G3, G4, G5, G6, G7, G8, G9, \
G10,G11,G12,G13,G14,G15,G16,G17,G18,G19, \
G20,G21,G22,G23,G24,G25,G26,G27,G28,G29 };
/* Shift status stored in _iconv_st._istate: IN after SI, OUT after SO or a
 * single-shift byte. */
enum _ISTATE { IN, OUT };
/* Convert the character held in st->_keepc to UTF-8 in buf; returns the
 * number of bytes written, or 0 with st->_errno set on failure. */
int iso_to_utf8(_iconv_st * st, char* buf, size_t buflen);
/* Binary search of v[0..n-1] for key x; returns the index or -1. */
int binsearch(unsigned long x, table_t v[], int n);
/* Emit the bytes buffered in st->_buf to the output; returns 0, or -1 with
 * st->_errno set to E2BIG when they do not fit. */
int flush_buf(_iconv_st * st, char ** outbuf, size_t * outbytesleft);
/*
 * flush_buf: write the escape-sequence bytes collected in st->_buf to the
 * output unchanged and empty the buffer.
 *
 * st            conversion state; _buf/_bufcont are read and _bufcont is
 *               reset to 0 on success
 * outbuf        in/out: current write position, advanced by the bytes written
 * outbytesleft  in/out: space remaining, decreased by the bytes written
 *
 * Returns 0 on success (including when nothing was buffered). Returns -1
 * and sets st->_errno to E2BIG when the buffered bytes do not fit; in that
 * case neither the buffer nor the output position is modified.
 */
int flush_buf(_iconv_st * st, char ** outbuf, size_t * outbytesleft) {
	size_t pending = st->_bufcont;

	if (pending == 0)
		return 0;
	if (pending > *outbytesleft) {
		st->_errno = E2BIG;
		return -1;
	}
	/*
	 * Copy FROM the holding buffer TO the output. memcpy is used rather
	 * than a string routine because the bytes are raw data with no
	 * terminator.
	 */
	memcpy(*outbuf, st->_buf, pending);
	*outbuf += pending;
	*outbytesleft -= pending;
	st->_bufcont = 0;
	return 0;
}
/*
 * _icv_open: allocate a conversion state and put it in its initial
 * condition (state G0, shifted in, no plane selected, empty buffer).
 *
 * Returns the new state as a void pointer, or (void *)-1 with errno set to
 * ENOMEM when the allocation fails. The _buf and _keepc arrays are left
 * uninitialized.
 */
void *
_icv_open()
{
	_iconv_st *state = (_iconv_st *)malloc(sizeof(_iconv_st));

	if (state == NULL) {
		errno = ENOMEM;
		return ((void *) -1);
	}

	state->_bufcont = 0;
	state->_errno = 0;
	state->_plane = -1;
	state->_last_plane = -1;
	state->_istate = IN;
	state->_gstate = G0;

	return ((void *) state);
}
/*
 * _icv_close: release a conversion state obtained from _icv_open().
 *
 * A NULL state is rejected by setting errno to EBADF; nothing is freed in
 * that case.
 */
void
_icv_close(_iconv_st *st)
{
	if (st != NULL) {
		free(st);
		return;
	}
	errno = EBADF;
}
/*
 * _icv_iconv: convert input to UTF-8 one byte at a time, driven by the state
 * held in st->_gstate. Plain bytes are copied through; escape sequences are
 * collected in st->_buf until they are either recognized (and discarded) or
 * rejected (and written out via flush_buf()); double-byte characters are
 * gathered in st->_keepc and converted by iso_to_utf8().
 *
 * st            conversion state from _icv_open()
 * inbuf         in/out: read position; advanced as bytes are consumed
 * inbytesleft   in/out: bytes remaining in the input
 * outbuf        in/out: write position; advanced as bytes are produced
 * outbytesleft  in/out: space remaining in the output
 *
 * Returns (size_t)-1 with errno = EBADF when st is NULL. When inbuf or
 * *inbuf is NULL it clears st->_errno and returns 0 (st->_gstate is not
 * reset on that path). Otherwise it loops while both input and output space
 * remain. A failing flush_buf() or iso_to_utf8() makes it return (size_t)-1
 * with errno copied from st->_errno. If input remains when output space
 * reaches zero it returns (size_t)-1 with errno = E2BIG. On the remaining
 * path it returns the current value of *inbytesleft.
 * NOTE(review): iconv implementations conventionally return the number of
 * non-reversible conversions here - confirm what the caller expects.
 *
 * Inside the switch, "break" consumes the current input byte (the pointer
 * is advanced after the switch) while "continue" re-examines the same byte
 * in the newly selected state.
 */
size_t _icv_iconv(_iconv_st *st, \
char **inbuf, size_t *inbytesleft, \
char **outbuf, size_t *outbytesleft) {
int n;
char c;
if (st == NULL) {
errno = EBADF;
return ((size_t) -1);
}
/* Reset request: only the recorded error is cleared. */
if (inbuf == NULL || *inbuf == NULL) {
st->_errno = 0;
return ((size_t) 0);
}
errno = st->_errno = 0;
while (*inbytesleft > 0 && *outbytesleft > 0) {
switch (st->_gstate) {
/* G0: no sequence pending. ESC starts buffering; anything else is copied. */
case G0:
if (**inbuf == ESC) {
st->_gstate = G1;
st->_buf[st->_bufcont++] = ESC;
} else {
**outbuf = **inbuf;
(*outbuf)++;
(*outbytesleft)--;
}
break;
/* G1: ESC buffered. '$' continues the sequence; otherwise the buffered
 * bytes are flushed and the current byte is re-examined in G0. */
case G1:
if (**inbuf == '$') {
st->_gstate = G2;
st->_buf[st->_bufcont++] = '$';
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G0;
st->_errno = 0;
st->_istate = IN;
continue;
}
break;
/* G2: "ESC $" buffered. ')' -> G3, '*' -> G12 (plane 2), '+' -> G19;
 * anything else flushes and restarts in G0. */
case G2:
if (**inbuf == ')') {
st->_gstate = G3;
} else if (**inbuf == '*') {
st->_gstate = G12;
st->_plane = 2;
} else if (**inbuf == '+') {
st->_gstate = G19;
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G0;
st->_errno = 0;
st->_istate = IN;
continue;
}
/* Reached only for the three accepted bytes: keep them in the buffer. */
st->_buf[st->_bufcont++] = **inbuf;
break;
/* G3: "ESC $ )" buffered. The final byte selects the plane:
 * 'A' -> plane 0 (G4), 'G' -> plane 1 (G8), 'H' -> plane 2 (G8). */
case G3:
if (**inbuf == 'A') {
st->_plane = 0;
st->_gstate = G4;
} else if (**inbuf == 'G') {
st->_plane = 1;
st->_gstate = G8;
} else if (**inbuf == 'H') {
st->_plane = 2;
st->_gstate = G8;
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G0;
st->_errno = 0;
st->_istate = IN;
continue;
}
st->_buf[st->_bufcont++] = **inbuf;
break;
/* G4: plane-0 designation complete; SO is expected next. On SO the buffered
 * sequence is discarded and the plane is committed to _last_plane.
 * Otherwise the sequence is flushed to the output, _plane is restored
 * from _last_plane and the byte is re-examined in G0. */
case G4:
if (**inbuf == SO) {
st->_gstate = G5;
st->_istate = OUT;
st->_bufcont = 0;
st->_last_plane = st->_plane;
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G0;
st->_errno = 0;
st->_istate = IN;
st->_plane = st->_last_plane;
continue;
}
break;
/* G5: shifted out on plane 0, waiting for the first byte of a character.
 * SI -> G7; ESC is handed back to G0; any other byte is stored as the
 * lead byte. */
case G5:
if (**inbuf == SI) {
st->_istate = IN;
st->_gstate = G7;
st->_last_plane = st->_plane;
} else if (**inbuf == ESC) {
st->_bufcont = 0;
st->_gstate = G0;
continue;
} else {
st->_keepc[0] = **inbuf;
st->_gstate = G6;
}
break;
/* G6: second byte of a plane-0 character; convert the pair. On failure the
 * function returns before the state or the input pointer is updated. */
case G6:
st->_keepc[1] = **inbuf;
n = iso_to_utf8(st, *outbuf, *outbytesleft);
if (n > 0) {
(*outbuf) += n;
(*outbytesleft) -= n;
} else {
errno = st->_errno;
return (size_t)-1;
}
st->_gstate = G5;
break;
/* G7: shifted in after plane-0 text. SO re-enters G5, ESC is handed back
 * to G0, other bytes are copied unchanged. */
case G7:
if (**inbuf == SO) {
st->_gstate = G5;
st->_istate = OUT;
st->_last_plane = st->_plane;
st->_bufcont = 0;
} else if (**inbuf == ESC) {
st->_gstate = G0;
continue;
} else {
**outbuf = **inbuf;
(*outbuf)++;
(*outbytesleft) --;
}
break;
/* G8: designation for plane 1 or 2 via "ESC $ )" complete; SO is expected.
 * Mirrors G4. */
case G8:
if (**inbuf == SO) {
st->_istate = OUT;
st->_gstate = G9;
st->_bufcont = 0;
st->_last_plane = st->_plane;
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G0;
st->_errno = 0;
st->_plane = st->_last_plane;
st->_istate = IN;
continue;
}
break;
/* G9: shifted out on plane 1 or 2, waiting for a lead byte. Mirrors G5,
 * except that ESC calls flush_buf() instead of clearing _bufcont. */
case G9:
if (**inbuf == SI) {
st->_istate = IN;
st->_gstate = G11;
st->_last_plane = st->_plane;
} else if (**inbuf == ESC) {
if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
}
st->_gstate = G0;
continue;
} else {
st->_keepc[0] = **inbuf;
st->_gstate = G10;
}
break;
/* G10: second byte of a plane-1/2 character; convert the pair. */
case G10:
st->_keepc[1] = **inbuf;
n = iso_to_utf8(st, *outbuf, *outbytesleft);
if (n > 0) {
(*outbuf) += n;
(*outbytesleft) -= n;
} else {
errno = st->_errno;
return (size_t)-1;
}
st->_gstate = G9;
break;
/* G11: shifted in after plane-1/2 text. Mirrors G7; the buffer count is
 * cleared on every byte seen here. */
case G11:
st->_bufcont = 0;
if (**inbuf == SO) {
st->_istate = OUT;
st->_gstate = G9;
} else if (**inbuf == ESC) {
st->_gstate = G0;
continue;
} else {
**outbuf = **inbuf;
(*outbuf)++;
(*outbytesleft)--;
}
break;
/* G12: "ESC $ *" buffered; only 'H' is accepted. */
case G12:
if (**inbuf == 'H') {
st->_buf[st->_bufcont++] = 'H';
st->_gstate = G13;
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_istate = IN;
st->_plane = st->_last_plane;
st->_gstate = G0;
continue;
}
break;
/* G13: "ESC $ * H" buffered; an ESC (start of the single-shift) is expected. */
case G13:
if (**inbuf == ESC) {
st->_buf[st->_bufcont++] = **inbuf;
st->_gstate = G14;
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G0;
st->_istate = IN;
st->_plane = st->_last_plane;
continue;
}
break;
/* G14: "ESC $ * H ESC" buffered. SS2 enters plane-2 character mode (G15).
 * '$' means the trailing ESC began a new designation: that ESC is
 * removed from the buffer, the rest is flushed, and '$' is re-examined
 * in G1.
 * NOTE(review): the removed ESC is not put back, so G1 buffers '$'
 * without a leading ESC - confirm this is intended. */
case G14:
if (**inbuf == SS2) {
st->_istate = OUT;
st->_gstate = G15;
st->_bufcont = 0;
st->_last_plane = st->_plane = 2;
} else if (**inbuf == '$') {
st->_bufcont --;
if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G1;
st->_plane = st->_last_plane;
st->_istate = IN;
continue;
}
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G0;
st->_istate = IN;
st->_plane = st->_last_plane;
continue;
}
break;
/* G15: after ESC SS2, waiting for a lead byte. Characters keep being read
 * in pairs (G15 <-> G18) until SI or ESC is seen. */
case G15:
if (**inbuf == SI) {
st->_gstate = G16;
st->_istate = IN;
st->_last_plane = st->_plane;
} else if (**inbuf == ESC) {
st->_bufcont = 0;
st->_gstate = G0;
continue;
} else {
st->_keepc[0] = **inbuf;
st->_gstate = G18;
}
break;
/* G16: shifted in after SS2 text. ESC is buffered (-> G17); other bytes are
 * copied unchanged. */
case G16:
if (**inbuf == ESC) {
st->_gstate = G17;
st->_buf[st->_bufcont++] = ESC;
} else {
**outbuf = **inbuf;
(*outbuf) ++;
(*outbytesleft) --;
st->_bufcont = 0;
}
break;
/* G17: ESC buffered while in G16. SS2 returns to character mode (G15).
 * NOTE(review): on '$' this state buffers '$' and then "continue"s into
 * G1, which buffers the same '$' again; compare G25, which does not
 * buffer it here.
 * NOTE(review): in the final else branch the current byte is consumed
 * by the "break" without being copied to the output. */
case G17:
if (**inbuf == '$') {
st->_gstate = G1;
st->_buf[st->_bufcont++] = '$';
continue;
} else if (**inbuf == SS2) {
st->_bufcont = 0;
st->_gstate = G15;
st->_istate = OUT;
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G16;
st->_istate = IN;
}
break;
/* G18: second byte of an SS2 character. The state is switched back to G15
 * before the conversion, so it stays G15 even if the conversion fails. */
case G18:
st->_keepc[1] = **inbuf;
st->_gstate = G15;
if ((n = iso_to_utf8(st, \
*outbuf, \
*outbytesleft)) > 0) {
(*outbuf)+=n;
(*outbytesleft)-=n;
} else {
errno = st->_errno;
return (size_t)-1;
}
break;
/* G19: "ESC $ +" buffered. Final bytes 'I'..'V' select planes 3..16. */
case G19:
c = **inbuf;
if (c == 'I' || \
c == 'J' || \
c == 'K' || \
c == 'L' || \
c == 'M' || \
c == 'N' || \
c == 'O' || \
c == 'P' || \
c == 'Q' || \
c == 'R' || \
c == 'S' || \
c == 'T' || \
c == 'U' || \
c == 'V') {
st->_plane = c - 'I' + 3;
st->_gstate = G20;
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G0;
st->_errno = 0;
st->_istate = IN;
st->_plane = st->_last_plane;
continue;
}
st->_buf[st->_bufcont++] = c;
break;
/* G20: "ESC $ + <final>" buffered; an ESC is expected. On rejection this
 * state (unlike G12/G13) copies _plane into _last_plane rather than
 * restoring _plane. */
case G20:
if (**inbuf == ESC) {
st->_buf[st->_bufcont++] = **inbuf;
st->_gstate = G21;
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G0;
st->_istate = IN;
st->_last_plane = st->_plane;
continue;
}
break;
/* G21: "... ESC" buffered. SS3 enters character mode (G22); '$' drops the
 * trailing ESC, flushes the rest and re-examines '$' in G1. */
case G21:
if (**inbuf == SS3) {
st->_istate = OUT;
st->_gstate = G22;
st->_bufcont = 0;
} else if (**inbuf == '$') {
st->_bufcont --;
if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_istate = IN;
st->_last_plane = st->_plane;
st->_gstate = G1;
continue;
}
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G0;
st->_istate = IN;
st->_last_plane = st->_plane;
continue;
}
break;
/* G22: after ESC SS3, waiting for a lead byte. A four-byte code is built in
 * _keepc: MBYTE, PMASK + plane, then the two input bytes. */
case G22:
if (**inbuf == SI) {
st->_istate = IN;
st->_gstate = G24;
st->_last_plane = st->_plane;
} else {
st->_keepc[0] = (char)MBYTE;
st->_keepc[1] = (char)(PMASK + st->_plane);
st->_keepc[2] = **inbuf;
st->_gstate = G23;
}
break;
/* G23: final byte of the four-byte code; convert it.
 * NOTE(review): G19 sets _plane to 3..16 before this state is reached,
 * and iso_to_utf8() as defined in this file only has cases for planes
 * 0..2 - confirm whether this conversion can ever succeed. */
case G23:
st->_keepc[3] = **inbuf;
if ((n = iso_to_utf8(st, \
*outbuf, \
*outbytesleft)) > 0) {
(*outbuf)+=n;
(*outbytesleft-=n);
} else {
errno = st->_errno;
return (size_t)-1;
}
st->_gstate = G22;
break;
/* G24: shifted in after SS3 text. ESC is buffered (-> G25); other bytes are
 * copied unchanged. */
case G24:
if (**inbuf == ESC) {
st->_gstate = G25;
st->_buf[st->_bufcont++] = ESC;
} else {
**outbuf = **inbuf;
(*outbuf)++;
(*outbytesleft)--;
st->_bufcont = 0;
}
break;
/* G25: ESC buffered while in G24. '$' is re-examined in G1; SS3 returns to
 * character mode (G22).
 * NOTE(review): as in G17, the final else branch consumes the current
 * byte without copying it to the output. */
case G25:
if (**inbuf == '$') {
st->_gstate = G1;
continue;
} else if (**inbuf == SS3) {
st->_gstate = G22;
st->_bufcont = 0;
st->_istate = OUT;
} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
errno = st->_errno;
return (size_t)-1;
} else {
st->_gstate = G24;
st->_istate = IN;
}
break;
/* Any state value without a case label: record EILSEQ and reset to G0.
 * NOTE(review): this path leaves the loop through the st->_errno check
 * below and can end up returning *inbytesleft instead of (size_t)-1. */
default:
st->_errno = errno = EILSEQ;
st->_gstate = G0;
break;
}
/* Reached via "break" only: the current byte has been consumed. */
(*inbuf)++;
(*inbytesleft)--;
if (st->_errno) {
break;
}
if (errno)
return((size_t)(-1));
}
/* Input remains but the output buffer is full. */
if (*inbytesleft > 0 && *outbytesleft == 0) {
errno = E2BIG;
return((size_t)(-1));
}
return (size_t)((*inbytesleft));
}
/*
 * iso_to_utf8: look up the double-byte character held in st->_keepc[0..1]
 * in the table for st->_plane and write its UTF-8 encoding to buf.
 *
 * st      conversion state; _plane selects the table (0: Unicode[] indexed
 *         by row/column, 1: cns1_utf_tab, 2: cns2_utf_tab) and _errno
 *         receives the error code
 * buf     destination for the UTF-8 bytes
 * buflen  space available at buf
 *
 * Returns the number of bytes written (2 or 3). Returns 0 with st->_errno
 * set to EILSEQ when the plane is not 0..2 or the character is not found in
 * its table, or to E2BIG when buf is too small.
 *
 * A table value outside U+0080..U+FFFF is written as U+FFFD (the replacement
 * character). That includes values below 0x80, which are deliberately not
 * emitted as single bytes - presumably the tables use such values for
 * unassigned slots; TODO confirm against the table headers.
 */
int iso_to_utf8(_iconv_st * st, char* buf, size_t buflen) {
	unsigned long iso_val;
	unsigned long unicode;
	int unidx;

	switch (st->_plane) {
	case 0:
		/* 94 columns per row; both bytes are offset by 0x21. */
		unidx = (((st->_keepc[0] & 0xff) - 0x21) * 94) +
		    (st->_keepc[1] & 0xff) - 0x21;
		if (unidx < 0 || unidx >= GBMAX) {
			st->_errno = EILSEQ;
			return (0);
		}
		unicode = Unicode[unidx];
		break;
	case 1:
		iso_val = ((st->_keepc[0] & MSB_OFF) << 8) +
		    (st->_keepc[1] & MSB_OFF);
		unidx = binsearch(iso_val, cns1_utf_tab, MAX_CNS1_NUM);
		if (unidx < 0) {
			st->_errno = EILSEQ;
			return (0);
		}
		unicode = cns1_utf_tab[unidx].value;
		break;
	case 2:
		iso_val = ((st->_keepc[0] & MSB_OFF) << 8) +
		    (st->_keepc[1] & MSB_OFF);
		unidx = binsearch(iso_val, cns2_utf_tab, MAX_CNS2_NUM);
		if (unidx < 0) {
			st->_errno = EILSEQ;
			return (0);
		}
		unicode = cns2_utf_tab[unidx].value;
		break;
	default:
		st->_errno = EILSEQ;
		return (0);
	}

	/* Two-byte form: U+0080..U+07FF, lower bound inclusive. */
	if (unicode >= 0x0080 && unicode <= 0x07ff) {
		if (buflen < 2) {
			st->_errno = E2BIG;
			return (0);
		}
		buf[0] = (char)(0xc0 | ((unicode >> 6) & 0x1f));
		buf[1] = (char)(0x80 | (unicode & 0x3f));
		return (2);
	}

	/* Three-byte form: U+0800..U+FFFF, lower bound inclusive. */
	if (unicode >= 0x0800 && unicode <= 0xffff) {
		if (buflen < 3) {
			st->_errno = E2BIG;
			return (0);
		}
		buf[0] = (char)(0xe0 | ((unicode >> 12) & 0x0f));
		buf[1] = (char)(0x80 | ((unicode >> 6) & 0x3f));
		buf[2] = (char)(0x80 | (unicode & 0x3f));
		return (3);
	}

	/* Anything else becomes U+FFFD; all three bytes must be written. */
	if (buflen < 3) {
		st->_errno = E2BIG;
		return (0);
	}
	buf[0] = (char)UTF8_NON_ID_CHAR1;
	buf[1] = (char)UTF8_NON_ID_CHAR2;
	buf[2] = (char)UTF8_NON_ID_CHAR3;
	return (3);
}
/*
 * binsearch: binary search for key x among the first n entries of v,
 * comparing against each entry's .key field. The search assumes the entries
 * are ordered by ascending key.
 *
 * Returns the index of the matching entry, or -1 when no entry matches
 * (including when n is 0 or negative).
 */
int binsearch(unsigned long x, table_t v[], int n)
{
	int lo = 0;
	int hi = n - 1;

	while (lo <= hi) {
		int probe = (lo + hi) / 2;

		if (x > v[probe].key) {
			lo = probe + 1;
		} else if (x < v[probe].key) {
			hi = probe - 1;
		} else {
			return probe;
		}
	}
	return (-1);
}
#ifdef DEBUG
/*
 * Debug driver (built only with DEBUG defined): reads the file named by
 * argv[1], dumps its bytes in hex to stderr, converts it with _icv_iconv()
 * and writes the result to standard output.
 *
 * Exits with a non-zero status after printing a diagnostic when any step
 * fails; returns 0 on success.
 */
int main(int argc, char ** argv) {
	char *inbuf, *outbuf, *in_tmp, *out_tmp;
	size_t inbytesleft, outbytesleft, outsize;
	size_t i;
	ssize_t nread;
	int fd;
	struct stat s;
	_iconv_st * st;

	if (argc < 2) {
		fprintf(stderr, "Usage: %s input\n", argv[0]);
		exit(-1);
	}
	if ((fd = open(argv[1], O_RDONLY)) == -1) {
		perror("open");
		exit(-2);
	}
	if (fstat(fd, &s) == -1) {
		perror("stat");
		exit(-3);
	}
	inbytesleft = (size_t)s.st_size;
	/*
	 * A two-byte input character becomes three UTF-8 bytes, so the output
	 * can be larger than the input; twice the input size is always enough.
	 */
	outsize = outbytesleft = inbytesleft * 2;
	/* +1 so that an empty file does not turn into malloc(0). */
	in_tmp = inbuf = (char *)malloc(inbytesleft + 1);
	out_tmp = outbuf = (char *)malloc(outsize + 1);
	if (!inbuf || !outbuf) {
		perror("malloc");
		exit(-1);
	}
	nread = read(fd, inbuf, inbytesleft);
	if (nread < 0 || (size_t)nread != inbytesleft) {
		perror("read");
		exit(-4);
	}
	/* Print each byte as an unsigned value so bytes >= 0x80 are not
	 * sign-extended. */
	for (i = 0; i < inbytesleft; i++)
		fprintf(stderr, "%x\t", (unsigned int)(unsigned char)inbuf[i]);
	fprintf(stderr, "\n");
	st = (_iconv_st *)_icv_open();
	if (st == (_iconv_st *) -1) {
		perror("_icv_open");
		exit(-1);
	}
	if (_icv_iconv(st,
	    &inbuf, &inbytesleft,
	    &outbuf, &outbytesleft) == (size_t)-1) {
		perror("icv_iconv");
		fprintf(stderr, "\ninbytesleft = %lu\n",
		    (unsigned long)inbytesleft);
		exit(-2);
	}
	if (write(1, out_tmp, outsize - outbytesleft) == -1) {
		perror("write");
		exit(-1);
	}
	free(in_tmp);
	free(out_tmp);
	close(fd);
	_icv_close(st);
	return 0;
}
#endif