#include <sys/types.h>
/*
 * Derive the endian-neutral family macros (UTF32 / UTF16 / UCS2) from the
 * endian-specific build-time selectors, separately for the source
 * (FROMCODE) and the destination (TOCODE) encoding.  The family macros
 * choose which read_unicode()/write_unicode() variant gets compiled below.
 * NOTE(review): the family macros are presumably also defined directly by
 * the including file for the BOM-detecting (endian-unspecified) variants;
 * confirm against the includers.
 */
#if defined(JFP_ICONV_FROMCODE_UTF32BE)||defined(JFP_ICONV_FROMCODE_UTF32LE)
#define JFP_ICONV_FROMCODE_UTF32
#endif
#if defined(JFP_ICONV_FROMCODE_UTF16BE)||defined(JFP_ICONV_FROMCODE_UTF16LE)
#define JFP_ICONV_FROMCODE_UTF16
#endif
#if defined(JFP_ICONV_FROMCODE_UCS2BE)||defined(JFP_ICONV_FROMCODE_UCS2LE)
#define JFP_ICONV_FROMCODE_UCS2
#endif
#if defined(JFP_ICONV_TOCODE_UTF32BE)||defined(JFP_ICONV_TOCODE_UTF32LE)
#define JFP_ICONV_TOCODE_UTF32
#endif
#if defined(JFP_ICONV_TOCODE_UTF16BE)||defined(JFP_ICONV_TOCODE_UTF16LE)
#define JFP_ICONV_TOCODE_UTF16
#endif
#if defined(JFP_ICONV_TOCODE_UCS2BE)||defined(JFP_ICONV_TOCODE_UCS2LE)
#define JFP_ICONV_TOCODE_UCS2
#endif
/* Byte order mark as a code point. */
#define BOM 0xfeff
/* What a 16-bit BOM looks like when read with the wrong byte order. */
#define BSBOM16 0xfffe
/* What a 32-bit BOM looks like when read with the wrong byte order. */
#define BSBOM32 0xfffe0000
/* Substitute written in place of code points that UCS-2 cannot hold. */
#define REPLACE 0xfffd
/* True when x lies in the high (leading) surrogate range d800..dbff. */
#define IFHISUR(x) ((0xd800 <= (x)) && ((x) <= 0xdbff))
/* True when x lies in the low (trailing) surrogate range dc00..dfff. */
#define IFLOSUR(x) ((0xdc00 <= (x)) && ((x) <= 0xdfff))
/*
 * Per-conversion state shared by the read_unicode()/write_unicode()
 * variants in this file.
 */
typedef struct {
/*
 * B_FALSE until BOM handling is done: readers set it after inspecting the
 * first code unit, writers after the BOM has been emitted successfully.
 */
boolean_t bom_written;
/* B_TRUE when code units are read/written least significant byte first. */
boolean_t little_endian;
} ucs_state_t;
#if defined(JFP_ICONV_FROMCODE_UTF32)
/*
 * Fetch one UTF-32 code unit from the input and hand it back as a code
 * point.
 *
 *   p       receives the code point (BOM when a byte order mark was
 *           consumed)
 *   pip     in/out input position; advanced only when the call succeeds
 *   pileft  in/out count of input bytes left; updated only on success
 *   state   BOM / byte-order state of this conversion
 *
 * Returns the number of bytes consumed for an ordinary character, 0 when
 * a leading byte order mark was consumed (it only selects the byte order),
 * and (size_t)-1 on failure.
 *
 * NGET and RETERROR are defined outside this file; judging by their use
 * here they operate on the locals ip, ileft and rv and leave through the
 * ret label on error, so those names must stay as they are.
 */
static size_t
read_unicode(
	unsigned int *p,
	unsigned char **pip,
	size_t *pileft,
	ucs_state_t *state)
{
	unsigned char	*ip = *pip;
	size_t		ileft = *pileft;
	size_t		rv = (size_t)0;
	unsigned char	b1, b2, b3, b4;
	unsigned int	be;	/* the four bytes taken as big endian */
	unsigned int	u32;

	NGET(b1, "UTF32-1");
	NGET(b2, "UTF32-2");
	NGET(b3, "UTF32-3");
	NGET(b4, "UTF32-4");

	be = ((unsigned int)b1 << 24) | ((unsigned int)b2 << 16) |
	    ((unsigned int)b3 << 8) | (unsigned int)b4;

	if (state->bom_written == B_FALSE) {
		/* only the very first code unit is examined for a BOM */
		state->bom_written = B_TRUE;
		if ((be == BOM) || (be == BSBOM32)) {
			/* a swapped BOM means the stream is little endian */
			state->little_endian =
			    (be == BSBOM32) ? B_TRUE : B_FALSE;
			*p = BOM;
			rv = (size_t)0;
			goto ret;
		}
	}

	if (state->little_endian == B_TRUE) {
		u32 = ((unsigned int)b4 << 24) | ((unsigned int)b3 << 16) |
		    ((unsigned int)b2 << 8) | (unsigned int)b1;
	} else {
		u32 = be;
	}

	if (u32 == BSBOM32) {
		RETERROR(EILSEQ, "byte-swapped BOM detected")
	}
	/* non-characters, out-of-range values and surrogates are rejected */
	if ((u32 > 0x10ffff) || IFHISUR(u32) || IFLOSUR(u32) ||
	    (u32 == 0xfffe) || (u32 == 0xffff)) {
		RETERROR(EILSEQ, "illegal in UTF-32")
	}

	*p = u32;
	rv = *pileft - ileft;

ret:
	if (rv == (size_t)-1) {
		/* failure: the caller's position is left untouched */
		return (rv);
	}
	*pip = ip;
	*pileft = ileft;
	return (rv);
}
#elif defined(JFP_ICONV_FROMCODE_UTF16) || defined(JFP_ICONV_FROMCODE_UCS2)
/*
 * Fetch one character from UTF-16 (or UCS-2) input and hand it back as a
 * code point.
 *
 *   p       receives the code point (BOM when a byte order mark was
 *           consumed)
 *   pip     in/out input position; advanced only when the call succeeds
 *   pileft  in/out count of input bytes left; updated only on success
 *   state   BOM / byte-order state of this conversion
 *
 * Returns the number of bytes consumed for an ordinary character (a
 * surrogate pair counts both units), 0 when a leading byte order mark was
 * consumed, and (size_t)-1 on failure.  With JFP_ICONV_FROMCODE_UCS2 any
 * surrogate is an error; otherwise a high surrogate must be followed by a
 * low surrogate and the pair is combined.
 *
 * NGET and RETERROR are defined outside this file; judging by their use
 * here they operate on the locals ip, ileft and rv and leave through the
 * ret label on error, so those names must stay as they are.
 */
static size_t
read_unicode(
	unsigned int *p,
	unsigned char **pip,
	size_t *pileft,
	ucs_state_t *state)
{
	unsigned char	*ip = *pip;
	size_t		ileft = *pileft;
	size_t		rv = (size_t)0;
	unsigned char	b1, b2;
	unsigned int	be;	/* first unit taken as big endian */
	unsigned int	u32;
#ifndef JFP_ICONV_FROMCODE_UCS2
	unsigned int	trail;
#endif

	NGET(b1, "UTF16-1");
	NGET(b2, "UTF16-2");

	if (state->bom_written == B_FALSE) {
		/* only the very first code unit is examined for a BOM */
		state->bom_written = B_TRUE;
		be = ((unsigned int)b1 << 8) | (unsigned int)b2;
		if ((be == BOM) || (be == BSBOM16)) {
			/* a swapped BOM means the stream is little endian */
			state->little_endian =
			    (be == BSBOM16) ? B_TRUE : B_FALSE;
			*p = BOM;
			rv = (size_t)0;
			goto ret;
		}
	}

	u32 = (state->little_endian == B_TRUE) ?
	    ((((unsigned int)b2) << 8) | b1) :
	    ((((unsigned int)b1) << 8) | b2);

	if (u32 == BSBOM16) {
		RETERROR(EILSEQ, "byte-swapped BOM detected")
	}
	/* a low surrogate may never come first */
	if (IFLOSUR(u32) || (u32 > 0x10ffff) ||
	    (u32 == 0xfffe) || (u32 == 0xffff)) {
		RETERROR(EILSEQ, "illegal in UTF16")
	}

	if (IFHISUR(u32)) {
#if defined(JFP_ICONV_FROMCODE_UCS2)
		RETERROR(EILSEQ, "surrogate is illegal in UCS2")
#else
		NGET(b1, "LOSUR-1");
		NGET(b2, "LOSUR-2");
		trail = (state->little_endian == B_TRUE) ?
		    ((((unsigned int)b2) << 8) | b1) :
		    ((((unsigned int)b1) << 8) | b2);
		if (!IFLOSUR(trail)) {
			RETERROR(EILSEQ, "low-surrogate expected")
		}
		/* 10 bits from each half, offset by 0x10000 */
		u32 = 0x10000 + ((u32 - 0xd800) << 10) + (trail - 0xdc00);
#endif
	}

	*p = u32;
	rv = *pileft - ileft;

ret:
	if (rv == (size_t)-1) {
		/* failure: the caller's position is left untouched */
		return (rv);
	}
	*pip = ip;
	*pileft = ileft;
	return (rv);
}
#else
/*
 * Indexed by the first byte of a UTF-8 sequence: the number of bytes that
 * must follow it.  utf8_ucs() handles bytes below 0x80 before consulting
 * this table; for the others an entry of 0 marks a byte that cannot start
 * a sequence (0x80-0xc1 and 0xf5-0xff).
 */
static const char remaining_bytes_tbl[0x100] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xc0 - 0xdf: one byte follows (0xc0 and 0xc1 are invalid) */
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0xe0 - 0xef: two bytes follow */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* 0xf0 - 0xf4: three bytes follow; 0xf5 and above are invalid */
3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/*
 * Indexed by the number of bytes that follow the first byte of a UTF-8
 * sequence: the mask that keeps the payload bits of that first byte.
 * utf8_ucs() only uses indexes 1 to 3, because remaining_bytes_tbl never
 * yields more than 3.
 */
static const char masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
/*
 * Indexed by the first byte of a multi-byte UTF-8 sequence: the smallest
 * value accepted for the second byte.  Most lead bytes allow 0x80; 0xe0
 * requires at least 0xa0 and 0xf0 at least 0x90.  Entries for bytes that
 * cannot start a multi-byte sequence are 0 and are not consulted by
 * utf8_ucs().
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xc0 - 0xc7 */
0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/* 0xe0 - 0xe7: 0xe0 needs a second byte of at least 0xa0 */
0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/* 0xf0 - 0xf7: 0xf0 needs a second byte of at least 0x90 */
0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
};
/*
 * Indexed by the first byte of a multi-byte UTF-8 sequence: the largest
 * value accepted for the second byte.  Most lead bytes allow 0xbf; 0xed
 * is capped at 0x9f and 0xf4 at 0x8f.  Entries for bytes that cannot
 * start a multi-byte sequence are 0 and are not consulted by utf8_ucs().
 */
static const unsigned char valid_max_2nd_byte[0x100] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xc0 - 0xc7 */
0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/* 0xe8 - 0xef: 0xed allows a second byte of at most 0x9f */
0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
/* 0xf0 - 0xf7: 0xf4 allows a second byte of at most 0x8f */
0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
};
static size_t
utf8_ucs(unsigned int *p, unsigned char **pip, size_t *pileft)
{
unsigned int l;
unsigned char ic;
unsigned char ic1;
unsigned char *ip = *pip;
size_t ileft = *pileft;
size_t rv = (size_t)0;
int remaining_bytes;
NGET(ic, "no bytes available");
ic1 = ic;
l = ic1;
if (ic1 < 0x80) {
*p = l;
rv = *pileft - ileft;
goto ret;
}
remaining_bytes = remaining_bytes_tbl[ic1];
if (remaining_bytes != 0) {
l &= masks_tbl[remaining_bytes];
for (; remaining_bytes > 0; remaining_bytes--) {
if (ic1 != 0U) {
NGET(ic, "2nd byte of UTF-8");
if ((ic < valid_min_2nd_byte[ic1]) ||
(ic > valid_max_2nd_byte[ic1])) {
RETERROR(EILSEQ, "2nd byte is invalid")
}
ic1 = 0U;
} else {
NGET(ic, "3rd or later byte of UTF-8");
if ((ic < 0x80) || (ic > 0xbf)) {
RETERROR(EILSEQ, "3rd or later byte is invalid")
}
}
l = (l << 6) | (ic & 0x3f);
}
*p = l;
rv = *pileft - ileft;
goto ret;
} else {
RETERROR(EILSEQ, "1st byte is invalid")
}
ret:
if (rv != (size_t)-1) {
*pip = ip;
*pileft = ileft;
}
return (rv);
}
/*
 * UTF-8 flavour of read_unicode(): decoding is delegated to utf8_ucs().
 * The state argument is accepted for signature parity with the other
 * variants and is not used.  Returns whatever utf8_ucs() returns.
 */
static size_t
read_unicode(
	unsigned int *p,
	unsigned char **pip,
	size_t *pileft,
	ucs_state_t *state)
{
	size_t	consumed;

	consumed = utf8_ucs(p, pip, pileft);
	return (consumed);
}
#endif
#if defined(JFP_ICONV_TOCODE_UTF32)
/*
 * Store one code point as UTF-32, preceded by a byte order mark when the
 * state says none has been written yet.
 *
 *   u32     code point to store (not validated here)
 *   pop     in/out output position; advanced only when the call succeeds
 *   poleft  in/out count of output bytes left; updated only on success
 *   state   BOM / byte-order state; bom_written is set on success
 *   msg     not used by this variant
 *
 * Returns the number of bytes produced (4, or 8 including the BOM), or
 * (size_t)-1 on failure.
 *
 * NPUT is defined outside this file; judging by its use here it operates
 * on the locals op, oleft and rv and leaves through the ret label on
 * error, so those names must stay as they are.
 */
static size_t
write_unicode(
	unsigned int u32,
	char **pop,
	size_t *poleft,
	ucs_state_t *state,
	const char *msg)
{
	char		*op = *pop;
	size_t		oleft = *poleft;
	size_t		rv = (size_t)0;
	const unsigned int bom = BOM;
	/* right-shift applied to obtain the 1st..4th output byte */
	unsigned int	sh1, sh2, sh3, sh4;

	if (state->little_endian == B_TRUE) {
		sh1 = 0;
		sh2 = 8;
		sh3 = 16;
		sh4 = 24;
	} else {
		sh1 = 24;
		sh2 = 16;
		sh3 = 8;
		sh4 = 0;
	}

	if (state->bom_written == B_FALSE) {
		rv += 4;
		NPUT((unsigned char)((bom >> sh1) & 0xff), "BOM32-1")
		NPUT((unsigned char)((bom >> sh2) & 0xff), "BOM32-2")
		NPUT((unsigned char)((bom >> sh3) & 0xff), "BOM32-3")
		NPUT((unsigned char)((bom >> sh4) & 0xff), "BOM32-4")
	}

	rv += 4;
	NPUT((unsigned char)((u32 >> sh1) & 0xff), "UTF32-1")
	NPUT((unsigned char)((u32 >> sh2) & 0xff), "UTF32-2")
	NPUT((unsigned char)((u32 >> sh3) & 0xff), "UTF32-3")
	NPUT((unsigned char)((u32 >> sh4) & 0xff), "UTF32-4")

ret:
	if (rv == (size_t)-1) {
		/* failure: nothing is committed, the BOM stays pending */
		return (rv);
	}
	*pop = op;
	*poleft = oleft;
	if (state->bom_written == B_FALSE) {
		state->bom_written = B_TRUE;
	}
	return (rv);
}
#elif defined(JFP_ICONV_TOCODE_UTF16) || defined(JFP_ICONV_TOCODE_UCS2)
/*
 * Store one code point as UTF-16 (or UCS-2), preceded by a byte order
 * mark when the state says none has been written yet.
 *
 *   u32     code point to store (not validated here)
 *   pop     in/out output position; advanced only when the call succeeds
 *   poleft  in/out count of output bytes left; updated only on success
 *   state   BOM / byte-order state; bom_written is set on success
 *   msg     not used by this variant
 *
 * Code points above 0xffff are written as a surrogate pair, or replaced by
 * REPLACE when JFP_ICONV_TOCODE_UCS2 is defined.  Returns the number of
 * bytes produced, or (size_t)-1 on failure.
 *
 * NPUT is defined outside this file; judging by its use here it operates
 * on the locals op, oleft and rv and leaves through the ret label on
 * error, so those names must stay as they are.
 */
static size_t
write_unicode(
	unsigned int u32,
	char **pop,
	size_t *poleft,
	ucs_state_t *state,
	const char *msg)
{
	char		*op = *pop;
	size_t		oleft = *poleft;
	size_t		rv = (size_t)0;
	unsigned int	trail = 0U;	/* low surrogate; 0 means none */
	/* right-shift applied to obtain the 1st and 2nd byte of a unit */
	unsigned int	sh1, sh2;

	if (state->little_endian == B_TRUE) {
		sh1 = 0;
		sh2 = 8;
	} else {
		sh1 = 8;
		sh2 = 0;
	}

	if (state->bom_written == B_FALSE) {
		rv += 2;
		NPUT((unsigned char)((BOM >> sh1) & 0xff), "BOM16-1")
		NPUT((unsigned char)((BOM >> sh2) & 0xff), "BOM16-2")
	}

	if (u32 > 0xffff) {
#if defined(JFP_ICONV_TOCODE_UCS2)
		u32 = REPLACE;
#else
		/* split the 20-bit offset into two 10-bit halves */
		trail = 0xdc00 + ((u32 - 0x10000) & 0x3ff);
		u32 = 0xd800 + ((u32 - 0x10000) >> 10);
#endif
	}

	rv += 2;
	NPUT((unsigned char)((u32 >> sh1) & 0xff), "UTF16-1")
	NPUT((unsigned char)((u32 >> sh2) & 0xff), "UTF16-2")

	if (trail != 0U) {
		rv += 2;
		NPUT((unsigned char)((trail >> sh1) & 0xff), "LOSUR-1")
		NPUT((unsigned char)((trail >> sh2) & 0xff), "LOSUR-2")
	}

ret:
	if (rv == (size_t)-1) {
		/* failure: nothing is committed, the BOM stays pending */
		return (rv);
	}
	*pop = op;
	*poleft = oleft;
	if (state->bom_written == B_FALSE) {
		state->bom_written = B_TRUE;
	}
	return (rv);
}
#else
/*
 * Store one code point as UTF-8.
 *
 *   u32     code point to store
 *   pop     in/out output position; advanced only when the call succeeds
 *   poleft  in/out count of output bytes left; updated only on success
 *   state   not used by this variant
 *   msg     passed on to NPUT for every byte written
 *
 * Returns the number of bytes produced (1 to 4), or (size_t)-1 on
 * failure.  Surrogates and values above 0x10ffff are rejected with
 * EILSEQ.
 *
 * NPUT and RETERROR are defined outside this file; judging by their use
 * here they operate on the locals op, oleft and rv and leave through the
 * ret label on error, so those names must stay as they are.
 */
static size_t
write_unicode(
	unsigned int u32,
	char **pop,
	size_t *poleft,
	ucs_state_t *state,
	const char *msg)
{
	char		*op = *pop;
	size_t		oleft = *poleft;
	size_t		rv = 0;
	unsigned int	lead;		/* first byte of the sequence */
	unsigned int	ntrail;		/* continuation bytes to write */
	size_t		nbytes;

	if (u32 > 0x10ffff) {
		RETERROR(EILSEQ, "beyond range of UTF-8")
	}
	if ((u32 >= 0xd800) && (u32 <= 0xdfff)) {
		RETERROR(EILSEQ, "surrogate in UTF-8")
	}

	if (u32 <= 0x7f) {
		lead = u32;
		ntrail = 0;
	} else if (u32 <= 0x7ff) {
		lead = ((u32 >> 6) & 0x1f) | 0xc0;
		ntrail = 1;
	} else if (u32 <= 0xffff) {
		lead = ((u32 >> 12) & 0x0f) | 0xe0;
		ntrail = 2;
	} else {
		lead = ((u32 >> 18) & 0x07) | 0xf0;
		ntrail = 3;
	}
	nbytes = (size_t)ntrail + 1;

	NPUT((unsigned char)(lead), msg);
	/* continuation bytes carry 6 bits each, most significant first */
	while (ntrail > 0) {
		ntrail--;
		NPUT((unsigned char)(((u32 >> (6 * ntrail)) & 0x3f) | 0x80),
		    msg);
	}
	rv = nbytes;

ret:
	if (rv == (size_t)-1) {
		/* failure: the caller's position is left untouched */
		return (rv);
	}
	*pop = op;
	*poleft = oleft;
	return (rv);
}
#endif
/*
 * GETU(pu32): read the next code point into *pu32 through read_unicode().
 *
 * The expanding function must provide the variables ip, ileft, cd and rv
 * and the labels ret and next.  A result of (size_t)-1 sets rv to
 * (size_t)-1 and jumps to ret; a result of 0 sets rv to 0 and jumps to
 * next; any other result falls through with *pu32 set.
 */
#define GETU(pu32) \
switch (read_unicode(pu32, &ip, &ileft, (ucs_state_t *)cd)) { \
case (size_t)-1: \
\
rv = (size_t)-1; \
goto ret; \
case (size_t)0: \
\
\
rv = (size_t)0; \
goto next; \
default: \
break; \
}
/*
 * PUTU(u32, msg): write the code point u32 through write_unicode().
 *
 * The expanding function must provide the variables op, oleft, cd and rv
 * and the label ret.  When write_unicode() returns (size_t)-1, rv is set
 * to (size_t)-1 and control jumps to ret.
 */
#define PUTU(u32, msg) \
if (write_unicode(u32, &op, &oleft, (ucs_state_t *)cd, msg) \
== (size_t)-1) { \
rv = ((size_t)-1);\
goto ret; \
}
#include <stdlib.h>
/*
 * Put the conversion state back to its initial values.
 *
 * cd points to a ucs_state_t.  The values are picked at compile time:
 * builds with an explicit big- or little-endian encoding fix the byte
 * order and set bom_written to B_TRUE; other builds take the byte order
 * from _LITTLE_ENDIAN / _BIG_ENDIAN and set bom_written to B_FALSE.
 * When none of the tested macros is defined the state is left unchanged.
 */
static void
_icv_reset_unicode(void *cd)
{
ucs_state_t *state = (ucs_state_t *)cd;
#if defined(JFP_ICONV_FROMCODE_UTF32BE) || \
defined(JFP_ICONV_TOCODE_UTF32BE) || \
defined(JFP_ICONV_FROMCODE_UTF16BE) || \
defined(JFP_ICONV_TOCODE_UTF16BE) || \
defined(JFP_ICONV_FROMCODE_UCS2BE) || \
defined(JFP_ICONV_TOCODE_UCS2BE)
/* explicit big endian: byte order fixed, BOM handling already "done" */
state->little_endian = B_FALSE;
state->bom_written = B_TRUE;
#elif defined(JFP_ICONV_FROMCODE_UTF32LE) || \
defined(JFP_ICONV_TOCODE_UTF32LE) || \
defined(JFP_ICONV_FROMCODE_UTF16LE) || \
defined(JFP_ICONV_TOCODE_UTF16LE) || \
defined(JFP_ICONV_FROMCODE_UCS2LE) || \
defined(JFP_ICONV_TOCODE_UCS2LE)
/* explicit little endian: byte order fixed, BOM handling already "done" */
state->little_endian = B_TRUE;
state->bom_written = B_TRUE;
#elif defined(_LITTLE_ENDIAN)
/* no explicit byte order: start little endian, BOM still pending */
state->little_endian = B_TRUE;
state->bom_written = B_FALSE;
#elif defined(_BIG_ENDIAN)
/* no explicit byte order: start big endian, BOM still pending */
state->little_endian = B_FALSE;
state->bom_written = B_FALSE;
#endif
return;
}
/*
 * Allocate and initialise a conversion descriptor.
 *
 * The descriptor is a zero-filled ucs_state_t immediately followed by
 * extsize bytes of caller-private storage (see _icv_get_ext()).  The state
 * part is initialised with _icv_reset_unicode().
 *
 * Returns the descriptor, or (void *)-1 with errno set to ENOMEM when the
 * memory cannot be obtained.  A request whose total size does not fit in
 * a size_t is treated the same way instead of being allowed to wrap
 * around into an undersized allocation.  The descriptor is released with
 * _icv_close_unicode().
 */
static void *
_icv_open_unicode(size_t extsize)
{
	ucs_state_t	*cd;

	/* sizeof (ucs_state_t) + extsize must not wrap around */
	if (extsize > (size_t)-1 - sizeof (ucs_state_t)) {
		errno = ENOMEM;
		return ((void *)-1);
	}

	cd = calloc(1, sizeof (ucs_state_t) + extsize);
	if (cd == NULL) {
		errno = ENOMEM;
		return ((void *)-1);
	}

	_icv_reset_unicode((void *)cd);
	return ((void *)cd);
}
/*
 * Release a conversion descriptor.
 *
 * A NULL descriptor is reported by setting errno to EBADF; nothing is
 * freed in that case.  Any other pointer is handed to free().
 */
static void
_icv_close_unicode(void *cd)
{
	if (cd != NULL) {
		free(cd);
		return;
	}
	errno = EBADF;
}
/*
 * Return the address of the storage that directly follows the
 * ucs_state_t at the start of the descriptor cd, i.e. cd advanced by
 * sizeof (ucs_state_t) bytes.  No check is made on cd.
 */
static void *
_icv_get_ext(void *cd)
{
	unsigned char	*base = (unsigned char *)cd;

	return ((void *)&base[sizeof (ucs_state_t)]);
}