| /**********************************************************************
|
| // @@@ START COPYRIGHT @@@ |
| // |
| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // @@@ END COPYRIGHT @@@ |
| **********************************************************************/
|
| /*
|
| * HISTORY
|
| * $Log: iconv_gen.c,v $
|
| * Revision 1.1.19.1 2001/12/07 15:37:53
|
| * * Add checking for the CONV_NO_UDC flag to disable mapping to UDC.
|
| * * Merge codes from kernel/bsd.
|
| *
|
| * Revision 1.1.15.2 2000/12/11 19:41:22
|
| * Enable table mapping for private use area character if 0 is returned
|
| * from the UDC function.
|
| *
|
| * Revision 1.1.15.1 2000/10/16 18:44:52
|
| * COSIX.Zulu to Yankee merge for GB18030 support.
|
| *
|
| * Revision 1.1.13.1 2000/08/07 14:33:48
|
| * Support GB18030 and map UTF-8 surrogate pair to the right UCS code.
|
| *
|
| * Revision 1.1.11.2 2000/01/19 19:56:15
|
| * Fix cut & paste error in input_ucs2().
|
| * [2000/01/14 14:38:48 Long_Man]
|
| *
|
| * Revision 1.1.11.1 2000/01/13 20:25:46
|
| * Remove the old conversion routine in favor of the enhanced
|
| * __<codeset>_index() routine, and add algorithmic UDC conversion
|
| * support. Also add a number of conversion routines to support
|
| * special font charset to Unicode conversion.
|
| *
|
| * Revision 1.1.9.4 1998/08/21 17:51:25
|
| * Fix iconv dump problem when converting ISO8859-1 to UCS-2.
|
| * [1998/08/20 18:50:33 Long_Man]
|
| *
|
| * Revision 1.1.9.3 1998/05/04 15:21:21
|
| * Fix typo error in output_ucs2().
|
| * [1998/05/04 14:54:26 Long_Man]
|
| *
|
| * Revision 1.1.9.2 1998/03/26 18:56:28
|
| * Prepend __ to global functions to prevent namespace pollution
|
| * and fix problem in UCS-2 and UCS-4 routines.
|
| * [1998/03/23 14:43:44 Long_Man]
|
| *
|
| * Revision 1.1.5.6 1997/06/17 21:29:46
|
| * Fix QAR 53565 by guarding against the boundary case.
|
| * [1997/06/16 21:37:44 Long_Man]
|
| *
|
| * Revision 1.1.5.5 1997/04/07 19:04:57
|
| * Fix QAR 52035 by adjusting input pointer to failure location.
|
| * [1997/04/03 22:40:55 Long_Man]
|
| *
|
| * Revision 1.1.5.4 1997/02/24 21:01:51
|
| * Fix UCS locale build by not writing BOM if it causes E2BIG error.
|
| * [1997/02/21 21:36:41 Long_Man]
|
| *
|
| * Fix QAR 51653: Missing first character in UCS-2 output if BOM enabled.
|
| * [1997/02/21 14:58:45 Long_Man]
|
| *
|
| * Revision 1.1.5.3 1997/01/07 15:58:47
|
| * Add new converters for cp437 and cp850 codeset support.
|
| * [1996/12/24 16:15:44 Long_Man]
|
| *
|
| * Revision 1.1.5.2 1996/11/22 17:02:32
|
| * Improve execution speed & other minor bug fixes.
|
| * [1996/11/12 17:19:26 Long_Man]
|
| *
|
| * Consolidate UCS iconv converter binaries & support UCS-2.
|
| * [1996/10/28 21:11:39 Long_Man]
|
| *
|
| * Revision 1.1.2.4 1995/07/31 14:37:37
|
| * QAR 35010 - Verify the input buffer contains a full
|
| * wchar_t for processing from UCS-4/UTF-8 to a target
|
| * codeset. If inlen < wchar_t, set errno=EINVAL and
|
| * return ICONV_TRUNC.
|
| * [1995/07/12 14:41:34 Bill_Fountas]
|
| *
|
| * Fix pointer and length settings for E2BIG errors
|
| * [1995/06/27 13:17:11 Bill_Fountas]
|
| *
|
| * Revision 1.1.2.3 1995/06/30 14:45:11
|
| * Fix pointer and length settings for E2BIG errors
|
| * [1995/06/30 13:31:45 Bill_Fountas]
|
| *
|
| * Revision 1.1.2.2 1995/06/07 16:12:48
|
| * Initial iconv Unicode support check in
|
| * [1995/06/02 14:59:11 Kelly_Mulheren]
|
| *
|
| * $EndLog$
|
| */
|
|
|
| #ifndef _KERNEL
|
| #include <string.h>
|
| #else
|
| #include <stddef.h>
|
| #endif
|
| /*#include <sys/malloc.h> //BRL & JAC */
|
| #include "fcconv.h"
|
| #include "multi-byte.h"
|
/*
 * Macros to reverse the byte ordering
 *
 * REVERSE_UCS4_BYTE(ucs4) swaps the four low-order bytes of its argument;
 * REVERSE_UCS2_BYTE(ucs2) swaps the two low-order bytes of its argument.
 *
 * The UCS-4 masks are unsigned so that moving a byte >= 0x80 into the top
 * byte is an unsigned shift rather than a left shift of a signed int into
 * its sign bit.
 *
 * Both macros evaluate their argument more than once; do not pass an
 * expression with side effects.
 */
#define REVERSE_UCS4_BYTE(ucs4) ((((ucs4) & 0x000000ffU) << 24) | \
                                 (((ucs4) & 0x0000ff00U) <<  8) | \
                                 (((ucs4) & 0x00ff0000U) >>  8) | \
                                 (((ucs4) & 0xff000000U) >> 24))
#define REVERSE_UCS2_BYTE(ucs2) ((((ucs2) & 0x00ff) << 8) | \
                                 (((ucs2) & 0xff00) >> 8))
|
|
|
/*
 * Macro to set error status and returned value
 *
 * SET_ERR_RETURN(error) expects `error' to be a modifiable int holding one
 * of the ERR_* codes and expects a local variable named `retval' to exist
 * in the calling function.  It rewrites `error' to the matching errno
 * value, stores the matching ICONV_* status in `retval' and returns
 * `retval' from the calling function.  A code that is none of the three
 * ERR_* values is left as it is, and `retval' is returned unchanged.
 *
 * The user-space variant additionally hands the rewritten code to
 * _Seterrno(); the _KERNEL variant does not.
 *
 * Both variants are compiled only when DONT_NEED_THIS is defined.
 */
#ifdef DONT_NEED_THIS /* JAC */
#ifndef _KERNEL
#define SET_ERR_RETURN(error)                                   \
{                                                               \
    if ((error) == ERR_INVALID_CHAR) {                          \
        (error) = EILSEQ ;                                      \
        retval  = ICONV_INVAL ;                                 \
    } else if ((error) == ERR_INPUT_INCOMPLETE) {               \
        (error) = EINVAL ;                                      \
        retval  = ICONV_TRUNC ;                                 \
    } else if ((error) == ERR_BUFFER_OVERRUN) {                 \
        (error) = E2BIG ;                                       \
        retval  = ICONV_OVER ;                                  \
    }                                                           \
    _Seterrno(error ) ;                                         \
    return (retval) ;                                           \
}
#else
#define SET_ERR_RETURN(error)                                   \
{                                                               \
    if ((error) == ERR_INVALID_CHAR) {                          \
        (error) = EILSEQ ;                                      \
        retval  = ICONV_INVAL ;                                 \
    } else if ((error) == ERR_INPUT_INCOMPLETE) {               \
        (error) = EINVAL ;                                      \
        retval  = ICONV_TRUNC ;                                 \
    } else if ((error) == ERR_BUFFER_OVERRUN) {                 \
        (error) = E2BIG ;                                       \
        retval  = ICONV_OVER ;                                  \
    }                                                           \
    return (retval) ;                                           \
}
#endif
#endif /* DONT_NEED_THIS - JAC */
|
|
|
/**********
 * __from_ucs_exec
 *
 * Driver routine for converting from UCS/UTF.
 *
 * Each pass of the loop asks cd->infunc for the next input word, maps it
 * to a target character and stores that character in 1 to 4 bytes, most
 * significant byte first.  The mapping is selected in this order:
 *
 *   1. ASCII word while CONV_ASCII_ONOMAP or CONV_ASCII_ODMAP2 is set:
 *      copied unchanged (ONOMAP) or read from cd->ocell2_tab[0].
 *   2. UCS_UDC() word: cd->udcfunc, unless it is missing or CONV_NO_UDC
 *      is set (then BAD); a result of 0 falls back to GET_OVAL().
 *   3. CONV_FUNC_FIRST with an output function: the function first,
 *      GET_OVAL() only when the function yields BAD.
 *   4. Otherwise GET_OVAL() first; the output function, if any, is then
 *      either a fallback for BAD (CONV_FUNC_NEXT) or a post-processor of
 *      the looked-up character.
 *
 * A word that still maps to BAD is replaced by cd->defchar, replaced by
 * the bytes of cd->defstr (or silently dropped when that string is
 * empty), dropped when it is 0xFFFD, or reported as ERR_INVALID_CHAR.
 *
 * On return the caller's pointers and counts describe the unconsumed
 * input and the unused output.  Returns ICONV_DONE when in_buff is NULL
 * or no error was recorded; otherwise leaves through SET_ERR_RETURN().
 **********/
#ifdef USING_OPEN_SOURCE_from_ucs_exec /* JAC */
NA_EIDPROC
int
__from_ucs_exec (_LC_fcconv_iconv_t *cd,
                 uchar_t** in_buff , size_t *in_bytes_left ,
                 uchar_t** out_buff, size_t *out_bytes_left)
{
    uchar_t      *outptr ;       /* Next free byte of the output buffer  */
    uchar_t      *inptr ;        /* Start of the word being converted    */
    uchar_t      *inptr2 ;       /* Read cursor advanced by cd->infunc   */
    size_t        outlen ;       /* Output bytes still available         */
    size_t        inlen ;        /* Input bytes still to be consumed     */
    WChar_t       mbchar ;       /* Converted character        */ /*JAC */
    int           ucs ;          /* Word delivered by cd->infunc         */
    unsigned int  nbytes ;       /* Output size of mbchar */ /*BRL & JAC */
    unsigned int  left ;         /* Bytes of mbchar still to be stored   */
    int           error ;        /* Error code                           */
    int           retval ;       /* Return value, set by SET_ERR_RETURN  */
    int           idx ;          /* Kept from the original declarations  */
    int           ascii_copy ;   /* No ASCII mapping                     */
    int           ascii_table ;  /* ASCII direct map                     */
    int           func_first ;   /* Conversion function first            */
    int           func_next ;    /* Conversion function next             */
    cfunc_t       conv ;         /* Optional conversion function         */

    if (in_buff == NULL)
        return(ICONV_DONE) ;

    inptr       = *in_buff ;
    inptr2      = inptr ;
    inlen       = *in_bytes_left ;
    outptr      = *out_buff ;
    outlen      = *out_bytes_left ;
    error       = 0 ;
    ascii_copy  = cd->flags & CONV_ASCII_ONOMAP ;
    ascii_table = cd->flags & CONV_ASCII_ODMAP2 ;
    func_first  = cd->flags & CONV_FUNC_FIRST ;
    func_next   = cd->flags & CONV_FUNC_NEXT ;
    conv        = (cfunc_t)cd->outfunc ;

    /*
     * Conversion loop.  The loop header commits the input consumed by the
     * current word, so `continue' drops a word and `break' leaves inptr
     * at the word that failed.
     */
    for ( ; inlen > 0 ; inlen -= inptr2 - inptr, inptr = inptr2) {
        ucs = (*cd->infunc)(cd, &inptr2, inlen) ;
        if (ucs < 0) {
            error = ucs ;               /* infunc reports errors as < 0 */
            break ;
        }

        /*
         * Translate the input word to a target character
         */
        if ((ascii_copy || ascii_table) && _ISASCII(ucs)) {
            mbchar = ascii_copy ? ucs : cd->ocell2_tab[0][ucs] ;
        }
        else if (UCS_UDC(ucs)) {
            /*
             * User defined character: use the UDC function when one is
             * configured and CONV_NO_UDC is not set.
             */
            mbchar = (cd->udcfunc && !(cd->flags & CONV_NO_UDC))
                         ? (*cd->udcfunc)(ucs) : BAD ;

            /* A UDC function result of 0 asks for a table mapping */
            if (mbchar == 0) {
                GET_OVAL(cd, ucs, mbchar) ;
            }
        }
        else if (func_first && conv) {
            /*
             * Conversion function first, table lookup as the fallback
             */
            mbchar = (*conv)(ucs) ;
            if (mbchar == BAD) {
                if (cd->maxucs && (ucs > cd->maxucs)) {
                    error = ERR_INVALID_CHAR ;  /* Invalid sequence */
                    break ;
                }
                GET_OVAL(cd, ucs, mbchar) ;
            }
        }
        else {
            if (cd->maxucs && (ucs > cd->maxucs)) {
                error = ERR_INVALID_CHAR ;      /* Invalid sequence */
                break ;
            }
            GET_OVAL(cd, ucs, mbchar) ;

            /*
             * Special conversion function, if defined: with
             * CONV_FUNC_NEXT it converts the input word after a failed
             * lookup, otherwise it modifies a successful lookup result.
             */
            if (conv) {
                if (func_next && (mbchar == BAD))
                    mbchar = (*conv)(ucs) ;
                else if (!func_next && (mbchar != BAD))
                    mbchar = (*conv)(mbchar) ;
            }
        }

        /*
         * No valid mapping: apply the configured substitution
         */
        if (mbchar == BAD) {
            if (cd->defchar != 0) {
                mbchar = cd->defchar ;
            }
            else {
                if (cd->defstr) {
                    if (cd->defstrlen == 0)
                        continue ;      /* Skip the invalid character */
                    if ((size_t)cd->defstrlen > outlen) { /*BRL & JAC */
                        error = ERR_BUFFER_OVERRUN ; /* Output buf overflow */
                        break ;
                    }
                    memcpy(outptr, cd->defstr, cd->defstrlen) ;
                    outptr += cd->defstrlen ;
                    outlen -= cd->defstrlen ;
                    continue ;
                }
                if (ucs == 0xFFFD)
                    continue ;          /* Skip a replacement character */
                error = ERR_INVALID_CHAR ;
                break ;
            }
        }

        /*
         * Size of the character in bytes: position of its highest
         * non-zero byte within the low 32 bits
         */
        if (mbchar & 0xff000000)
            nbytes = 4 ;
        else if (mbchar & 0x00ff0000)
            nbytes = 3 ;
        else if (mbchar & 0x0000ff00)
            nbytes = 2 ;
        else
            nbytes = 1 ;

        /*
         * Stop before writing if the output buffer cannot take it
         */
        if (outlen < nbytes) {
            error = ERR_BUFFER_OVERRUN ;
            break ;
        }

        /*
         * Store the bytes, most significant first
         */
        for (left = nbytes ; left > 0 ; left--)
            *outptr++ = (mbchar >> ((left - 1) * 8)) & 0xff ;
        outlen -= nbytes ;
    }

    /*
     * Report progress back to the caller
     */
    *in_buff        = inptr ;
    *in_bytes_left  = inlen ;
    *out_buff       = outptr ;
    *out_bytes_left = outlen ;
    if (!error)
        return(ICONV_DONE) ;
    SET_ERR_RETURN(error) ;
} /* __from_ucs_exec */
#endif /* USING_OPEN_SOURCE_from_ucs_exec // JAC */
|
|
|
/**********
 * __sb_to_ucs_exec
 *
 * Driver routine for converting from single-byte to UCS/UTF.
 *
 * Every input byte is mapped to a UCS value and handed to cd->outfunc,
 * which writes it and returns the number of output bytes used (or a
 * negative error code).  ASCII bytes are copied unchanged when
 * CONV_ASCII_INOMAP is set, or read from cd->icell2_tab[0] when
 * CONV_ASCII_IDMAP2 is set; all other bytes go through GET_IVAL().
 * A looked-up value above a non-zero cd->maxucs is treated as BAD.
 * A BAD value is replaced by cd->defucsch, dropped when cd->defstr is set
 * with length 0, or reported as ERR_INVALID_CHAR.
 *
 * On return the caller's pointers and counts describe the unconsumed
 * input and the unused output.  Returns ICONV_DONE when in_buff is NULL
 * or no error was recorded; otherwise leaves through SET_ERR_RETURN().
 **********/

#ifdef USING_OPEN_SOURCE_sb_to_ucs_exec /* JAC */
NA_EIDPROC
int
__sb_to_ucs_exec(_LC_fcconv_iconv_t *cd,
                 uchar_t** in_buff , size_t *in_bytes_left ,
                 uchar_t** out_buff, size_t *out_bytes_left)
{
    uchar_t *outptr ;           /* Next free byte of the output buffer  */
    uchar_t *inptr ;            /* Byte currently being converted       */
    size_t   outlen ;           /* Output bytes still available         */
    size_t   inlen ;            /* Input bytes still to be consumed     */
    WChar_t  ucs ;              /* Converted character        */ /*JAC */
    WChar_t  src ;              /* Input byte, widened        */ /*JAC */
    int      retval ;           /* outfunc result / return value        */
    int      error ;            /* Error code                           */
    int      ascii_copy ;       /* No ASCII mapping                     */
    int      ascii_table ;      /* ASCII direct map                     */

    if (in_buff == NULL)
        return ICONV_DONE;

    outptr      = *out_buff ;
    outlen      = *out_bytes_left ;
    inptr       = *in_buff ;
    inlen       = *in_bytes_left ;
    error       = 0 ;
    ascii_copy  = cd->flags & CONV_ASCII_INOMAP ;
    ascii_table = cd->flags & CONV_ASCII_IDMAP2 ;

    /*
     * Conversion loop: one input byte per pass.  The byte is consumed
     * only when it was written or deliberately skipped; on `break' inptr
     * still points at the byte that failed.
     */
    while (inlen > 0) {
        src = *inptr ;

        if ((ascii_copy || ascii_table) && _ISASCII(src)) {
            ucs = ascii_copy ? src : cd->icell2_tab[0][src] ;
        }
        else {
            GET_IVAL(cd, src, ucs) ;

            /*
             * Restrict output to less than ICONV_MAXUCS, if defined
             */
            if (cd->maxucs && (ucs != BAD) && (ucs > cd->maxucs))
                ucs = BAD ;

            /*
             * No valid mapping: substitute, skip or fail
             */
            if (ucs == BAD) {
                if (cd->defucsch) {
                    ucs = cd->defucsch ;
                }
                else if (cd->defstr && (cd->defstrlen == 0)) {
                    inptr++ ;           /* Skip that character */
                    inlen-- ;
                    continue ;
                }
                else {
                    error = ERR_INVALID_CHAR ;
                    break ;
                }
            }
        }

        retval = (*cd->outfunc)(cd, outptr, outlen, ucs) ;
        if (retval < 0) {
            error = retval ;            /* outfunc reports errors as < 0 */
            break ;
        }
        outptr += retval ;
        outlen -= retval ;
        inptr++ ;
        inlen-- ;
    }

    /*
     * Report progress back to the caller
     */
    *in_buff        = inptr ;
    *in_bytes_left  = inlen ;
    *out_buff       = outptr ;
    *out_bytes_left = outlen ;
    if (!error)
        return(ICONV_DONE) ;

    SET_ERR_RETURN(error) ;
} /* __sb_to_ucs_exec */
#endif /* USING_OPEN_SOURCE_sb_to_ucs_exec // JAC */
|
|
|
/**********
 * __sb_to_sb_exec
 *
 * Driver routine for converting from single-byte to single-byte via UCS.
 *
 * Each input byte is mapped in two stages: source byte to UCS (direct
 * ASCII copy, cd->icell2_tab[0], or GET_IVAL()), then UCS to target byte
 * (direct ASCII copy, cd->ocell2_tab[0], or GET_OVAL()).  Exactly one
 * output byte is stored per converted input byte.
 *
 * When a stage yields BAD the substitute is cd->defucsch (first stage) or
 * cd->defchar (second stage).  Without a substitute the byte is dropped
 * if cd->defstr is set with length 0, replaced by the cd->defstrlen bytes
 * of cd->defstr if that length is positive, and otherwise reported as
 * ERR_INVALID_CHAR.
 *
 * On return the caller's pointers and counts describe the unconsumed
 * input and the unused output.  Returns ICONV_DONE when in_buff is NULL
 * or no error was recorded; otherwise leaves through SET_ERR_RETURN().
 **********/

#ifdef USING_OPEN_SOURCE_sb_to_sb_exec /* JAC */
NA_EIDPROC
int
__sb_to_sb_exec(_LC_fcconv_iconv_t *cd,
                uchar_t** in_buff , size_t *in_bytes_left ,
                uchar_t** out_buff, size_t *out_bytes_left)
{
    uchar_t *inptr ;            /* Byte currently being converted       */
    uchar_t *outptr ;           /* Next free byte of the output buffer  */
    size_t   inlen ;            /* Input bytes still to be consumed     */
    size_t   outlen ;           /* Output bytes still available         */
    WChar_t  mapped ;           /* Result of the current stage /*JAC    */
    WChar_t  ch ;               /* Input to the current stage  /*JAC    */
    int      retval ;           /* Return value, set by SET_ERR_RETURN  */
    int      error ;            /* Error code                           */
    int      in_copy ;          /* No input ASCII mapping               */
    int      in_table ;         /* Input ASCII direct map               */
    int      out_copy ;         /* No output ASCII mapping              */
    int      out_table ;        /* Output ASCII direct map              */

    if (in_buff == NULL)
        return ICONV_DONE;

    outptr    = *out_buff ;
    outlen    = *out_bytes_left ;
    inptr     = *in_buff ;
    inlen     = *in_bytes_left ;
    error     = 0 ;
    in_copy   = cd->flags & CONV_ASCII_INOMAP ;
    in_table  = cd->flags & CONV_ASCII_IDMAP2 ;
    out_copy  = cd->flags & CONV_ASCII_ONOMAP ;
    out_table = cd->flags & CONV_ASCII_ODMAP2 ;

    /*
     * Conversion loop.  The loop header consumes the current input byte,
     * so `continue' moves on to the next byte while `break' leaves inptr
     * at the byte that failed.
     */
    for ( ; inlen > 0 ; inptr++, inlen--) {

        /*
         * Stage 1: source byte to UCS
         */
        ch = *inptr ;
        if ((in_copy || in_table) && _ISASCII(ch)) {
            mapped = in_copy ? ch : cd->icell2_tab[0][ch] ;
        }
        else {
            GET_IVAL(cd, ch, mapped) ;

            if (mapped == BAD) {
                if (cd->defucsch) {
                    mapped = cd->defucsch ;
                }
                else if (cd->defstr && (cd->defstrlen == 0)) {
                    continue ;          /* Skip that character */
                }
                else if (cd->defstrlen > 0) {
                    if (outlen < cd->defstrlen) {
                        /* Not enough output buffer */
                        error = ERR_BUFFER_OVERRUN ;
                        break ;
                    }
                    /* Copy default string to output */
                    bcopy(cd->defstr, (char *)outptr, cd->defstrlen) ;
                    outptr += cd->defstrlen ;
                    outlen -= cd->defstrlen ;
                    continue ;
                }
                else {
                    error = ERR_INVALID_CHAR ;
                    break ;
                }
            }
        }

        /* One output byte is about to be needed */
        if (((ssize_t)outlen) <= 0) {
            error = ERR_BUFFER_OVERRUN ;
            break ;
        }

        /*
         * Stage 2: UCS to target byte
         */
        ch = mapped ;
        if ((out_copy || out_table) && _ISASCII(ch)) {
            mapped = out_copy ? ch : cd->ocell2_tab[0][ch] ;
        }
        else {
            GET_OVAL(cd, ch, mapped) ;

            if (mapped == BAD) {
                if (cd->defchar) {
                    mapped = cd->defchar ;
                }
                else if (cd->defstr && (cd->defstrlen == 0)) {
                    continue ;          /* Skip that character */
                }
                else if (cd->defstrlen > 0) {
                    if (outlen < cd->defstrlen) {
                        /* Not enough output buffer */
                        error = ERR_BUFFER_OVERRUN ;
                        break ;
                    }
                    /* Copy default string to output */
                    bcopy(cd->defstr, (char *)outptr, cd->defstrlen) ;
                    outptr += cd->defstrlen ;
                    outlen -= cd->defstrlen ;
                    continue ;
                }
                else {
                    error = ERR_INVALID_CHAR ;
                    break ;
                }
            }
        }

        *outptr = mapped ;
        outptr++ ;
        outlen-- ;
    }

    /*
     * Report progress back to the caller
     */
    *in_buff        = inptr ;
    *in_bytes_left  = inlen ;
    *out_buff       = outptr ;
    *out_bytes_left = outlen ;
    if (!error)
        return(ICONV_DONE) ;

    SET_ERR_RETURN(error) ;
} /* __sb_to_sb_exec */
#endif /* USING_OPEN_SOURCE_sb_to_sb_exec // JAC */
|
|
|
/**********
 * __to_ucs_exec
 *
 * Generic driver routine for converting from any character set to UCS.
 * It is assumed that ASCII is a proper subset of the character set except
 * for UCS characters.
 *
 * When cd->srccode is UCS, each word returned by cd->infunc is passed
 * straight to cd->outfunc.
 *
 * Otherwise, when CONV_ASCII_INOMAP or CONV_ASCII_IDMAP2 is set, an ASCII
 * lead byte is consumed directly (copied unchanged, or read from
 * cd->icell2_tab[0]).  Any other input goes through cd->infunc, whose
 * result is turned into a UCS value with GET_UCODE(), IDXU_UCS() or
 * GET_IVAL(); table-derived values above a non-zero cd->maxucs become
 * BAD.  A BAD value, or a UCS_UDC() value while CONV_NO_UDC is set, is
 * replaced by cd->defucsch, dropped when cd->defstr is set with length 0,
 * or reported as ERR_INVALID_CHAR.
 *
 * On return the caller's pointers and counts describe the unconsumed
 * input and the unused output.  Returns ICONV_DONE when in_buff is NULL
 * or no error was recorded; otherwise leaves through SET_ERR_RETURN().
 **********/

#ifdef USING_OPEN_SOURCE_to_ucs_exec /* JAC */
NA_EIDPROC
int
__to_ucs_exec(_LC_fcconv_iconv_t *cd,
              uchar_t** in_buff , size_t *in_bytes_left ,
              uchar_t** out_buff, size_t *out_bytes_left)
{
    uchar_t *inptr ;            /* Start of the character being converted */
    uchar_t *inptr2 ;           /* Read cursor advanced while decoding    */
    uchar_t *outptr ;           /* Next free byte of the output buffer    */
    size_t   inlen ;            /* Input bytes still to be consumed       */
    size_t   outlen ;           /* Output bytes still available           */
    int      word ;             /* infunc result or ASCII lead byte       */
    WChar_t  ucs ;              /* UCS value to be written      */ /*JAC */
    int      retval ;           /* outfunc result / return value          */
    int      error ;            /* Error code                             */
    int      ascii_fast ;       /* Check for ASCII                        */
    int      ascii_copy ;       /* No mapping needed for ASCII            */
    int      is_ascii ;         /* Current lead byte took the ASCII path  */

    if (in_buff == NULL)
        return ICONV_DONE;

    outptr     = *out_buff ;
    outlen     = *out_bytes_left ;
    inptr      = *in_buff ;
    inptr2     = inptr ;
    inlen      = *in_bytes_left ;
    error      = 0 ;
    ascii_copy = cd->flags & CONV_ASCII_INOMAP ;
    ascii_fast = cd->flags & (CONV_ASCII_INOMAP|CONV_ASCII_IDMAP2) ;

    /*
     * Both loops commit the consumed input in the loop header, so `break'
     * leaves inptr at the character that failed.
     */
    if (cd->srccode == UCS) {
        for ( ; inlen > 0 ; inlen -= inptr2 - inptr, inptr = inptr2) {
            /*
             * infunc will return UCS-4 value
             */
            word = (*cd->infunc)(cd, &inptr2, inlen) ;
            if (word < 0) {
                error = word ;
                break ;
            }
            retval = (*cd->outfunc)(cd, outptr, outlen, word) ;
            if (retval < 0) {
                error = retval ;
                break ;
            }
            outptr += retval ;
            outlen -= retval ;
        }
    }
    else {
        for ( ; inlen > 0 ; inlen -= inptr2 - inptr, inptr = inptr2) {
            /* Peek at the lead byte only when an ASCII shortcut is on */
            is_ascii = 0 ;
            if (ascii_fast) {
                word = *inptr2 ;
                is_ascii = _ISASCII(word) ? 1 : 0 ;
            }

            if (is_ascii) {
                inptr2++ ;
                ucs = ascii_copy ? word : cd->icell2_tab[0][word] ;
            }
            else {
                /*
                 * infunc will return UCS-4 table index
                 */
                word = (*cd->infunc)(cd, &inptr2, inlen) ;
                if (word < 0) {
                    error = word ;
                    break ;
                }

                /*
                 * Convert table index into UCS-4
                 */
                if (word == BAD) {
                    ucs = BAD ;
                }
                else if (IS_UCODE(word)) {
                    ucs = GET_UCODE(word) ;
                }
                else {
                    /* Check for UDC */
                    if (ISIDXU(word)) {
                        ucs = IDXU_UCS(word) ;
                    }
                    else {
                        GET_IVAL(cd, word, ucs) ;
                    }

                    /*
                     * Restrict output to less than ICONV_MAXUCS, if defined
                     */
                    if (cd->maxucs && (ucs != BAD) && (ucs > cd->maxucs))
                        ucs = BAD ;
                }

                if ((ucs == BAD) ||
                    ((cd->flags & CONV_NO_UDC) && UCS_UDC(ucs))) {
                    if (cd->defucsch != 0) {
                        ucs = cd->defucsch ;
                    }
                    else if (cd->defstr && (cd->defstrlen == 0)) {
                        continue ;      /* Ignore this character */
                    }
                    else {
                        /*
                         * Invalid character; inptr still marks where it
                         * starts.
                         */
                        error = ERR_INVALID_CHAR ;
                        break ;
                    }
                }
            }

            retval = (*cd->outfunc)(cd, outptr, outlen, ucs) ;
            if (retval < 0) {
                error = retval ;
                break ;
            }
            outptr += retval ;
            outlen -= retval ;
        }
    }

    /*
     * Report progress back to the caller
     */
    *in_buff        = inptr ;
    *in_bytes_left  = inlen ;
    *out_buff       = outptr ;
    *out_bytes_left = outlen ;
    if (!error)
        return(ICONV_DONE) ;

    SET_ERR_RETURN(error) ;
} /* __to_ucs_exec */
#endif /* USING_OPEN_SOURCE_to_ucs_exec // JAC */
|
|
|
/**********
 * __cs_to_ucs_exec
 *
 * Special driver routine for converting from character set which may
 * not contain ASCII to UCS.
 *
 * Every character is decoded by cd->infunc; there is no ASCII shortcut.
 * The decoded index is turned into a UCS value with IDXU_UCS() or
 * GET_IVAL(), and a value above a non-zero cd->maxucs becomes BAD.
 * A BAD value, or a UCS_UDC() value while CONV_NO_UDC is set, is replaced
 * by cd->defucsch, dropped when cd->defstr is set with length 0, or
 * reported as ERR_INVALID_CHAR.  The result is written by cd->outfunc.
 *
 * On return the caller's pointers and counts describe the unconsumed
 * input and the unused output.  Returns ICONV_DONE when in_buff is NULL
 * or no error was recorded; otherwise leaves through SET_ERR_RETURN().
 **********/

#ifdef USING_OPEN_SOURCE_cs_to_ucs_exec /* JAC */
NA_EIDPROC
int
__cs_to_ucs_exec(_LC_fcconv_iconv_t *cd,
                 uchar_t** in_buff , size_t *in_bytes_left ,
                 uchar_t** out_buff, size_t *out_bytes_left)
{
    uchar_t *inptr ;            /* Start of the character being converted */
    uchar_t *inptr2 ;           /* Read cursor advanced by cd->infunc     */
    uchar_t *outptr ;           /* Next free byte of the output buffer    */
    size_t   inlen ;            /* Input bytes still to be consumed       */
    size_t   outlen ;           /* Output bytes still available           */
    int      word ;             /* Table index returned by cd->infunc     */
    WChar_t  ucs ;              /* UCS value to be written      */ /*JAC */
    int      error ;            /* Error code                             */
    int      retval ;           /* outfunc result / return value          */

    if (in_buff == NULL)
        return ICONV_DONE;

    outptr = *out_buff ;
    outlen = *out_bytes_left ;
    inptr  = *in_buff ;
    inptr2 = inptr ;
    inlen  = *in_bytes_left ;
    error  = 0 ;

    /*
     * Conversion loop.  The loop header commits the consumed input, so
     * `continue' drops a character and `break' leaves inptr at the
     * character that failed.
     */
    for ( ; inlen > 0 ; inlen -= inptr2 - inptr, inptr = inptr2) {
        /*
         * infunc will return UCS-4 table index
         */
        word = (*cd->infunc)(cd, &inptr2, inlen) ;
        if (word < 0) {
            error = word ;
            break ;
        }

        /*
         * Convert table index into UCS-4
         */
        if (word == BAD) {
            ucs = BAD ;
        }
        else {
            /* Check for UDC */
            if (ISIDXU(word)) {
                ucs = IDXU_UCS(word) ;
            }
            else {
                GET_IVAL(cd, word, ucs) ;
            }

            /*
             * Restrict output to less than ICONV_MAXUCS, if defined
             */
            if (cd->maxucs && (ucs != BAD) && (ucs > cd->maxucs))
                ucs = BAD ;
        }

        if ((ucs == BAD) ||
            ((cd->flags & CONV_NO_UDC) && UCS_UDC(ucs))) {
            if (cd->defucsch != 0) {
                ucs = cd->defucsch ;
            }
            else if (cd->defstr && (cd->defstrlen == 0)) {
                continue ;              /* Ignore this character */
            }
            else {
                /*
                 * Invalid character; inptr still marks where it starts.
                 */
                error = ERR_INVALID_CHAR ;
                break ;
            }
        }

        retval = (*cd->outfunc)(cd, outptr, outlen, ucs) ;
        if (retval < 0) {
            error = retval ;
            break ;
        }
        outptr += retval ;
        outlen -= retval ;
    }

    /*
     * Report progress back to the caller
     */
    *in_buff        = inptr ;
    *in_bytes_left  = inlen ;
    *out_buff       = outptr ;
    *out_bytes_left = outlen ;
    if (!error)
        return(ICONV_DONE) ;

    SET_ERR_RETURN(error) ;
} /* __cs_to_ucs_exec */
#endif /* USING_OPEN_SOURCE_cs_to_ucs_exec // JAC */
|
|
|
/**********
 * __mb_to_mb_exec
 *
 * Driver routine for converting from multi-byte to multi-byte via UCS.
 *
 * Each character is decoded by cd->infunc, turned into a UCS value
 * (direct ASCII copy, cd->icell2_tab[0], GET_UCODE(), IDXU_UCS() or
 * GET_IVAL()), then turned into a target character (direct ASCII copy,
 * cd->ocell2_tab[0], cd->udcfunc for UCS_UDC() values, or GET_OVAL()) and
 * stored in 1 to 4 bytes, most significant byte first.
 *
 * When the first stage yields BAD the substitute is cd->defucsch; when
 * GET_OVAL() yields BAD the substitute is cd->defchar.  Without a
 * substitute the character is dropped if cd->defstr is set with length 0,
 * replaced by the cd->defstrlen bytes of cd->defstr if that length is
 * positive, and otherwise reported as ERR_INVALID_CHAR.
 *
 * On return the caller's pointers and counts describe the unconsumed
 * input and the unused output.  Returns ICONV_DONE when in_buff is NULL
 * or no error was recorded; otherwise leaves through SET_ERR_RETURN().
 **********/

#ifdef USING_OPEN_SOURCE_mb_to_mb_exec /* JAC */
NA_EIDPROC
int
__mb_to_mb_exec(_LC_fcconv_iconv_t *cd,
                uchar_t** in_buff , size_t *in_bytes_left ,
                uchar_t** out_buff, size_t *out_bytes_left)
{
    uchar_t *inptr ;            /* Start of the character being converted */
    uchar_t *inptr2 ;           /* Read cursor advanced by cd->infunc     */
    uchar_t *outptr ;           /* Next free byte of the output buffer    */
    size_t   inlen ;            /* Input bytes still to be consumed       */
    size_t   outlen ;           /* Output bytes still available           */
    int      inword ;           /* Input to the current stage             */
    WChar_t  outword ;          /* Result of the current stage  */ /*JAC */
    int      char_size;         /* Multi-byte char size                   */
    int      error ;            /* Error code                             */
    int      retval ;           /* Return value, set by SET_ERR_RETURN    */
    int      inomap ;           /* No input ASCII mapping                 */
    int      id2map ;           /* Input ASCII direct map                 */
    int      onomap ;           /* No output ASCII mapping                */
    int      od2map ;           /* Output ASCII direct map                */
    int      idx ;              /* Kept from the original declarations    */

    if (!in_buff)
        return ICONV_DONE;

    inptr  = *in_buff ;
    inptr2 = inptr ;
    inlen  = *in_bytes_left ;
    outptr = *out_buff ;
    outlen = *out_bytes_left ;
    /*
     * error must start at 0: it is tested after the loop even when no
     * iteration recorded a failure.
     */
    error  = 0 ;
    inomap = cd->flags & CONV_ASCII_INOMAP ;
    id2map = cd->flags & CONV_ASCII_IDMAP2 ;
    onomap = cd->flags & CONV_ASCII_ONOMAP ;
    od2map = cd->flags & CONV_ASCII_ODMAP2 ;

    /**********
     * perform conversion
     *
     * The loop header commits the consumed input, so `continue' drops a
     * character and `break' leaves inptr at the character that failed.
     **********/
    for ( ; inlen > 0 ; inlen -= inptr2 - inptr, inptr = inptr2) {
        /*
         * infunc will return UCS-4 table index
         */
        if ((inword = (*cd->infunc)(cd, &inptr2, inlen)) < 0)
        {
            error = inword ;
            break ;
        }

        /*
         * Convert table index into UCS-4
         */
        if ((inomap || id2map) && _ISASCII(inword))
            outword = inomap ? inword : cd->icell2_tab[0][inword] ;
        else
        {
            if (inword == BAD)
                outword = BAD ;
            else if (IS_UCODE(inword))
                outword = GET_UCODE(inword) ;
            else
            {
                /*
                 * Check for UDC
                 */
                if (ISIDXU(inword))
                    outword = IDXU_UCS(inword) ;
                else
                {
                    GET_IVAL(cd, inword, outword)
                }
                /*
                 * Restrict output to less than ICONV_MAXUCS, if defined
                 */
                if (cd->maxucs && (outword != BAD))
                    if (outword > cd->maxucs)
                        outword = BAD ;
            }

            if (outword == BAD) {
                if (cd->defucsch != 0)
                    outword = cd->defucsch ;
                else if (cd->defstr && (cd->defstrlen == 0))
                    continue ;          /* Skip this character */
                else if (cd->defstrlen > 0) {
                    if (outlen < cd->defstrlen) {
                        error = ERR_BUFFER_OVERRUN ;
                        break ;
                    }
                    bcopy(cd->defstr, (char *)outptr, cd->defstrlen) ;
                    outptr += cd->defstrlen ;
                    outlen -= cd->defstrlen ;
                    continue ;
                }
                else
                /*
                 * Invalid character
                 */
                {
                    error = ERR_INVALID_CHAR ;
                    break ;
                }
            }
        }

        /*
         * Convert UCS-4 into output multibyte character
         */
        inword = outword ;
        if ((onomap || od2map) && _ISASCII(inword))
            outword = onomap ? inword : cd->ocell2_tab[0][inword] ;
        else if (UCS_UDC(inword))
        {
            /*
             * Map UCS UDC character to the corresponding multiple UDC
             * character, if applicable
             *
             * NOTE(review): a BAD result from this branch (no udcfunc, or
             * udcfunc itself yielding BAD) is not routed through the
             * default-character handling below and is stored as ordinary
             * bytes.  __from_ucs_exec checks for BAD after its UDC
             * branch and also honours CONV_NO_UDC -- confirm whether the
             * same is wanted here.
             */
            outword = cd->udcfunc ? (*cd->udcfunc)(inword) : BAD ;
        }
        else
        {
            GET_OVAL(cd, inword, outword) ;
            if (outword == BAD) {
                if (cd->defchar)
                    outword = cd->defchar ;
                else if ((cd->defstr) && (cd->defstrlen == 0))
                    continue ;          /* Skip this character */
                else if (cd->defstrlen > 0)
                {
                    if (outlen < cd->defstrlen) {
                        error = ERR_BUFFER_OVERRUN ;
                        break ;
                    }
                    bcopy(cd->defstr, (char *)outptr, cd->defstrlen) ;
                    outptr += cd->defstrlen ;
                    outlen -= cd->defstrlen ;
                    continue ;
                }
                else
                /*
                 * Invalid character
                 */
                {
                    error = ERR_INVALID_CHAR ;
                    break ;
                }
            }
        }

        /*
         * calculate the character size in byte
         */
        if ((outword & 0xffffff00) == 0)
            char_size = 1 ;
        else if ((outword & 0xffff0000) == 0)
            char_size = 2 ;
        else if ((outword & 0xff000000) == 0)
            char_size = 3 ;
        else
            char_size = 4 ;

        /*
         * Check for output buffer overflow
         */
        if (outlen < char_size) {
            error = ERR_BUFFER_OVERRUN ;
            break ;
        }

        /*
         * Output the bytes, most significant first; every case falls
         * through to the ones below it on purpose.
         */
        switch (char_size) {
            case 4: *outptr++ = (outword >> 24) & 0xff;
                    /* fallthrough */
            case 3: *outptr++ = (outword >> 16) & 0xff;
                    /* fallthrough */
            case 2: *outptr++ = (outword >> 8) & 0xff;
                    /* fallthrough */
            case 1: *outptr++ = outword & 0xff;
                    break ;
        }
        outlen -= char_size ;
    }

    /**********
     * set output parameters
     **********/
    *in_buff        = inptr ;
    *out_buff       = outptr ;
    *in_bytes_left  = inlen ;
    *out_bytes_left = outlen ;
    if (!error)
        return(ICONV_DONE) ;

    SET_ERR_RETURN(error) ;
} /* __mb_to_mb_exec */
#endif /* USING_OPEN_SOURCE_mb_to_mb_exec // JAC */
|
|
|
| /************************************************************************/
|
| /* */
|
| /* Macros, constants & type definitions for */
|
| /* UCS-2/UCS-4/UTF-7/UTF-8 input/output routines */
|
| /* */
|
| /************************************************************************/
|
|
|
/*
 * Surrogates related macros
 *
 * The surrogates Area consists of 1024 low-half surrogates and 1024 high-half
 * surrogates which are interpreted in pairs to access over a million codes.
 * The high surrogate characters are encoded in the range U+D800 -> U+DBFF.
 * The high surrogate character is always the first element of a
 * surrogate-pair.
 * The low surrogate characters are encoded in the range U+DC00 -> U+DFFF.
 * The low surrogate character is always the second element of a
 * surrogate-pair.
 * These code values are drawn from planes 1-16 of group 0 of UCS-4, that is,
 * the range of UCS-4 code values 0x010000 - 0x10ffff. The last two planes
 * (15 & 16) are reserved for private use.
 *
 * All function-like macros below may evaluate their arguments more than
 * once.
 */
#define MAX_UCS4_VALUE  0x10ffff    /* Max UCS-4 value for surrogates   */
#define SURROGATE_MASK  0x03ff      /* Payload bits of one surrogate    */
#define HSURROGATE_PL   0xd800      /* First high surrogate             */
#define LSURROGATE_PL   0xdc00      /* First low surrogate              */

/* Classification of a UCS-2 unit */
#define IS_HSURROGATE(ucs2)  (HSURROGATE_PL == ((ucs2) &~SURROGATE_MASK))
#define IS_LSURROGATE(ucs2)  (LSURROGATE_PL == ((ucs2) &~SURROGATE_MASK))

/* Split a UCS-4 value into its two surrogates */
#define GET_LSURROGATE(ucs4) (LSURROGATE_PL | ((ucs4) & SURROGATE_MASK))
#define GET_HSURROGATE(ucs4) (HSURROGATE_PL | \
                              ((((ucs4)-0x10000) >> 10) & SURROGATE_MASK))

/* Combine a surrogate pair into a UCS-4 value */
#define SURROGATES_TO_UCS4(high, low) \
                             (0x10000 + \
                              ((((high) & SURROGATE_MASK) << 10) | \
                               ((low ) & SURROGATE_MASK)))

/* True when a UCS-4 value lies in 0x10000 .. MAX_UCS4_VALUE */
#define NEED_SURROGATE(ucs4) (((ucs4) >= 0x10000) && \
                              ((ucs4) <= MAX_UCS4_VALUE))
|
|
|
| /*
|
| * UTF-8 related macros
|
| *
|
| * UTF-8 is a variable length encoding of Unicode using 8-bit sequences,
|
| * where the high bits indicate which part of the sequence a byte belongs to.
|
| * The following table shows how the bits in a Unicode value (or surrogate
|
| * pair) are distributed among the bytes in the UTF-8 encoding.
|
| *
|
| * Unicode value 1st byte 2nd byte 3rd byte 4th byte
|
| * ------------- -------- -------- -------- --------
|
| * 000000000gfedcba 0gfedcba
|
| *
|
| * 00000kjihgfedcba 110kjihg 10fedcba
|
| *
|
| * ponmlkjihgfedcba 1110ponm 10lkjihg 10fedcba
|
| *
|
| * 110110jihgfedcba 11110UTS 10RQponm 10lkjihg 10fedcba
|
| * 110111tsrqponmlk
|
| * where UTSRQ = tsrq + 1
|
| *
|
| * The following table shows the format of the first octet of a coded
|
| * character; the free bits available for coding the character are
|
| * indicated by an x. [Note 2]
|
| *
|
| * Octets Binary Bits Free Max. UCS-4
|
| * 1st of 1 0xxxxxxx 7 0000 007F
|
| * 1st of 2 110xxxxx 5 0000 07FF
|
| * 1st of 3 1110xxxx 4 0000 FFFF
|
| * 1st of 4 11110xxx 3 001F FFFF
|
| * 1st of 5 111110xx 2 03FF FFFF
|
| * 1st of 6 1111110x 1 7FFF FFFF
|
| * 2nd .. nth 10xxxxxx 6
|
| */
|
/* Smallest UCS value that needs the given number of UTF-8 bytes */
#define MIN_2BYTE_UTF8  0x80
#define MIN_3BYTE_UTF8  0x800
#define MIN_4BYTE_UTF8  0x10000
#define MIN_5BYTE_UTF8  0x200000
#define MIN_6BYTE_UTF8  0x4000000

/* Sequence length announced by a UTF-8 lead byte */
#define IS_1BYTE_UTF8(ch)       ((ch) < MIN_2BYTE_UTF8)
#define IS_2BYTE_UTF8(ch)       (0xc0 == ((ch) & 0xe0))
#define IS_3BYTE_UTF8(ch)       (0xe0 == ((ch) & 0xf0))
#define IS_4BYTE_UTF8(ch)       (0xf0 == ((ch) & 0xf8))
#define IS_5BYTE_UTF8(ch)       (0xf8 == ((ch) & 0xfc))
#define IS_6BYTE_UTF8(ch)       (0xfc == ((ch) & 0xfe))

/* True when a UCS value fits in at most the given number of UTF-8 bytes */
#define IS_UCS_TO_1B_UTF8(ucs)  ((ucs) < MIN_2BYTE_UTF8) /* <= 1 byte UTF-8  */
#define IS_UCS_TO_2B_UTF8(ucs)  ((ucs) < MIN_3BYTE_UTF8) /* <= 2 bytes UTF-8 */
#define IS_UCS_TO_3B_UTF8(ucs)  ((ucs) < MIN_4BYTE_UTF8) /* <= 3 bytes UTF-8 */
#define IS_UCS_TO_4B_UTF8(ucs)  ((ucs) < MIN_5BYTE_UTF8) /* <= 4 bytes UTF-8 */
#define IS_UCS_TO_5B_UTF8(ucs)  ((ucs) < MIN_6BYTE_UTF8) /* <= 5 bytes UTF-8 */
#define IS_UCS_TO_6B_UTF8(ucs)  ((ucs) < 0x80000000L)    /* <= 6 bytes UTF-8 */
|
/*
 * Decode an n-byte UTF-8 sequence starting at in[0]: the payload bits of
 * the lead byte form the most significant part and each following byte
 * contributes its low six bits.  The bytes are not validated here.
 */
#define RD_1BYTE_UTF8(in)   (in)[0]
#define RD_2BYTE_UTF8(in)   ((((in)[0] & 0x1f) <<  6) | \
                              ((in)[1] & 0x3f))
#define RD_3BYTE_UTF8(in)   ((((in)[0] & 0x0f) << 12) | \
                             (((in)[1] & 0x3f) <<  6) | \
                              ((in)[2] & 0x3f))
#define RD_4BYTE_UTF8(in)   ((((in)[0] & 0x07) << 18) | \
                             (((in)[1] & 0x3f) << 12) | \
                             (((in)[2] & 0x3f) <<  6) | \
                              ((in)[3] & 0x3f))
#define RD_5BYTE_UTF8(in)   ((((in)[0] & 0x03) << 24) | \
                             (((in)[1] & 0x3f) << 18) | \
                             (((in)[2] & 0x3f) << 12) | \
                             (((in)[3] & 0x3f) <<  6) | \
                              ((in)[4] & 0x3f))
#define RD_6BYTE_UTF8(in)   ((((in)[0] & 0x01) << 30) | \
                             (((in)[1] & 0x3f) << 24) | \
                             (((in)[2] & 0x3f) << 18) | \
                             (((in)[3] & 0x3f) << 12) | \
                             (((in)[4] & 0x3f) <<  6) | \
                              ((in)[5] & 0x3f))
|
/*
 * Encode a UCS value as an n-byte UTF-8 sequence.  `out' must be a
 * modifiable byte pointer; it is advanced past the bytes written.  The
 * macros do not check that `ucs' fits the chosen length or that the
 * output has room.
 *
 * WR_UTF8_TRAIL_ stores one continuation byte (10xxxxxx) taken from the
 * six bits of `ucs' that start at bit `shift'.
 */
#define WR_UTF8_TRAIL_(out, ucs, shift) \
        *(out)++ = (((ucs) >> (shift)) & 0x3f) | 0x80

#define WR_1BYTE_UTF8(out, ucs) { *(out)++ = (ucs) & 0x7f ; }
#define WR_2BYTE_UTF8(out, ucs) { *(out)++ = 0xc0 | (((ucs) >>  6) & 0x1f) ; \
                                  WR_UTF8_TRAIL_(out, ucs,  0) ; }
#define WR_3BYTE_UTF8(out, ucs) { *(out)++ = 0xe0 | (((ucs) >> 12) & 0x0f) ; \
                                  WR_UTF8_TRAIL_(out, ucs,  6) ; \
                                  WR_UTF8_TRAIL_(out, ucs,  0) ; }
#define WR_4BYTE_UTF8(out, ucs) { *(out)++ = 0xf0 | (((ucs) >> 18) & 0x07) ; \
                                  WR_UTF8_TRAIL_(out, ucs, 12) ; \
                                  WR_UTF8_TRAIL_(out, ucs,  6) ; \
                                  WR_UTF8_TRAIL_(out, ucs,  0) ; }
#define WR_5BYTE_UTF8(out, ucs) { *(out)++ = 0xf8 | (((ucs) >> 24) & 0x03) ; \
                                  WR_UTF8_TRAIL_(out, ucs, 18) ; \
                                  WR_UTF8_TRAIL_(out, ucs, 12) ; \
                                  WR_UTF8_TRAIL_(out, ucs,  6) ; \
                                  WR_UTF8_TRAIL_(out, ucs,  0) ; }
#define WR_6BYTE_UTF8(out, ucs) { *(out)++ = 0xfc | (((ucs) >> 30) & 0x01) ; \
                                  WR_UTF8_TRAIL_(out, ucs, 24) ; \
                                  WR_UTF8_TRAIL_(out, ucs, 18) ; \
                                  WR_UTF8_TRAIL_(out, ucs, 12) ; \
                                  WR_UTF8_TRAIL_(out, ucs,  6) ; \
                                  WR_UTF8_TRAIL_(out, ucs,  0) ; }
|
/*
 * Check that the trailing bytes of a UTF-8 byte sequence are valid.
 *
 *   in  - pointer to the lead byte of the sequence
 *   len - total length of the sequence in bytes, lead byte included
 *
 * A trailing byte is valid only if it has the form 10xxxxxx.  When an
 * invalid byte is found the macro executes "return(ERR_INVALID_CHAR)" in
 * the ENCLOSING function, so it may only be used inside a function whose
 * return type can carry that code.
 *
 * Both arguments are fully parenthesized and evaluated at most once, and
 * the temporaries carry a reserved-looking suffix so that they cannot
 * capture a caller variable named "ptr" or "cnt".
 */
#define CHECK_UTF8(in,len)                                          \
    do {                                                            \
        int chk_utf8_cnt_ = (len) - 1 ;                             \
        if (chk_utf8_cnt_ > 0) {                                    \
            uchar_t *chk_utf8_ptr_ = (in) + 1 ;                     \
            while (chk_utf8_cnt_-- > 0)                             \
                if ((*chk_utf8_ptr_++ & 0xc0) != 0x80)              \
                    return(ERR_INVALID_CHAR) ;                      \
        }                                                           \
    } while (0)
|
|
|
| #ifdef USING_OPEN_SOURCE_UTF7 /* JAC */
|
/*
 * UTF-7 related macros and constants
 *
 * UTF-7 is a 7-bit form of UCS Transformation Format.  UTF-7 depends on some
 * definition of US-ASCII character subsets:
 *
 * Set D (directly encoded characters, derived from RFC 1521) consists of
 *       the following characters:
 *              A-Z, a-z, 0-9, "'(),-./:?"
 *
 * Set O (optional direct characters) consists of the following 20 characters:
 *              "!\"#$%&*;<=>@[]^_`{|}"
 *
 * Set B (Modified Base 64) is the set of characters in the Base64 alphabet
 *       defined in RFC 1521, excluding the pad character "=".
 *
 */
#define SHIFT_IN        '+'
#define SHIFT_OUT       '-'
#define SETD_SIZE       (sizeof(direct  ) - 1)
#define SETO_SIZE       (sizeof(optional) - 1)
#define SETS_SIZE       (sizeof(spaces  ) - 1)
#define SETB_SIZE       (sizeof(base64  ) - 1)

/*
 * Remove the n most significant bits from a 32-bit, left-aligned bit
 * buffer and return them as a non-negative value.
 *
 * The arithmetic is done on unsigned values: shifting the signed buffer
 * directly sign-extends any value whose top bit is set and is undefined for
 * a shift count of 32 (which a zero-bit read would otherwise produce).
 */
static int
utf7_read_bits(int *bitbuffer, int *bits_in_buffer, int n)
{
    unsigned int pending = (unsigned int)*bitbuffer ;
    unsigned int taken   = 0 ;

    if (n > 0) {
        taken   = (n < 32) ? (pending >> (32 - n)) : pending ;
        pending = (n < 32) ? (pending << n)        : 0 ;
    }
    *bitbuffer       = (int)pending ;
    *bits_in_buffer -= n ;
    return((int)taken) ;
}

/*
 * Append the low n bits of value to a 32-bit, left-aligned bit buffer that
 * already holds *bits_in_buffer bits, and return the new bit count.
 * Bits that would not fit in the 32-bit buffer are dropped rather than
 * shifted by an out-of-range amount.
 */
static int
utf7_write_bits(int *bitbuffer, int *bits_in_buffer,
                unsigned long value, int n)
{
    int shift = 32 - n - *bits_in_buffer ;

    if ((n > 0) && (n <= 32) && (shift >= 0) && (shift < 32)) {
        unsigned int mask = (n < 32) ? ((1u << n) - 1u) : ~0u ;
        unsigned int bits = (unsigned int)value & mask ;

        *bitbuffer = (int)((unsigned int)*bitbuffer | (bits << shift)) ;
    }
    *bits_in_buffer += n ;
    return(*bits_in_buffer) ;
}

/*
 * Both macros operate on the bit buffer of the utf7 conversion table that
 * the enclosing function names "table".  READ_N_BITS additionally stores
 * its result in the enclosing function's "buffertemp" variable.
 */
#define READ_N_BITS(n) \
    (buffertemp = utf7_read_bits(&table->bitbuffer, \
                                 &table->bits_in_buffer, (n)))

#define WRITE_N_BITS(x,n) \
    utf7_write_bits(&table->bitbuffer, &table->bits_in_buffer, \
                    (unsigned long)(x), (n))
|
|
|
/* Set B: the Modified Base64 alphabet, listed in digit-value order */
static const char base64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "+/" ;

/* Set D: characters that are always encoded directly */
static const char direct[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "'(),-./:?" ;

/* Set O: characters that may optionally be encoded directly */
static const char optional[] =
    "!\"#$%&*;<=>@[]^_`{|}" ;

/* Space, tab, return, line feed */
static const char spaces[] =
    " \t\r\n" ;
|
| #endif /* USING_OPEN_SOURCE_UTF7 // JAC */
|
|
|
| #ifdef USING_OPEN_SOURCE_UTF7 /* JAC */
|
| /*
|
| * UTF-7 conversion structure
|
| */
|
| typedef struct utf7_conv
|
| {
|
| char mustshift[128] ;
|
| int invbase64[128] ;
|
| int shifted ; /* Set if in shifted state */
|
| int first ; /* Set for first character after SHIFT_IN */
|
| int wroteone ; /* Set if any least one UCS written */
|
| int bits_in_buffer ; /* Number of valid bits in buffer */
|
| int bitbuffer ; /* Buffer for base64 bits */
|
| ucs4_t high_surrogate ; /* Contain high surrogate */
|
| ucs4_t last_ucs4 ; /* Record last converted UCS-4 character */
|
| int last_cnvlen ; /* Record last conversion length */
|
|
|
| } utf7_conv_t ;
|
|
|
| /*
|
| * UTF-7 conversion table initialization routine
|
| */
|
| NA_EIDPROC
|
| static void
|
| utf7_convtable_init(_LC_fcconv_iconv_t *cd)
|
| {
|
| int idx ;
|
| utf7_conv_t *table ;
|
|
|
| if (cd->flags & CONV_UTF7_CONVTABLE)
|
| return ;
|
| #ifdef _KERNEL
|
| MALLOC(table, utf7_conv_t *, sizeof(utf7_conv_t),
|
| M_KERN, M_WAITOK) ;
|
| #else
|
| table = (utf7_conv_t *)malloc(sizeof(utf7_conv_t)) ;
|
| #endif
|
| if (table == NULL) {
|
| #ifndef _KERNEL
|
| perror("Memory allocation fails") ;
|
| exit (1) ;
|
| #else
|
| return ;
|
| #endif
|
|
|
| }
|
|
|
| bzero ((char *)table, sizeof(utf7_conv_t)) ;
|
| #ifndef _KERNEL
|
| memset(table->mustshift, 1, sizeof(table->mustshift)) ;
|
| memset(table->invbase64, -1, sizeof(table->invbase64)) ;
|
| #endif
|
|
|
| for (idx = 0 ; idx < SETD_SIZE ; idx++)
|
| table->mustshift[direct[idx]] = 0 ;
|
|
|
| for (idx = 0 ; idx < SETS_SIZE ; idx++)
|
| table->mustshift[spaces[idx]] = 0 ;
|
|
|
| if (cd->flags & CONV_UTF7_OPTIONAL)
|
| for (idx = 0 ; idx < SETO_SIZE ; idx++)
|
| table->mustshift[optional[idx]] = 0 ;
|
|
|
| for (idx = 0 ; idx < SETB_SIZE ; idx++)
|
| table->invbase64[base64[idx]] = idx ;
|
|
|
| cd->convtable = (caddr_t) table ;
|
| cd->flags |= CONV_UTF7_CONVTABLE ;
|
| }
|
| #endif /* USING_OPEN_SOURCE_UTF7 // JAC */
|
|
|
| /************************************************************************/
|
| /* */
|
| /* UCS-2/UCS-4/UTF-7/UTF-8 input routines */
|
| /* Return: The UCS-4 character or an error code */
|
| /* The in string reference is updated */
|
| /* */
|
| /************************************************************************/
|
| //LCOV_EXCL_START :cnu -- As of 8/30/2011, not used on SQ platform, but may be used on Clients
|
| NA_EIDPROC
|
| int __input_ucs4(_LC_fcconv_iconv_t *cd, uchar_t **in, int len)
|
| {
|
| ucs4_t *inptr = (ucs4_t *)*in ;
|
| int first = !(cd->flags & CONV_INPUT_PROCESSED) ;
|
| WChar_t word ; /*JAC */
|
|
|
| do {
|
| if (len < sizeof(ucs4_t))
|
| return(ERR_INPUT_INCOMPLETE) ;
|
|
|
| word = *inptr++ ;
|
| len -= sizeof(ucs4_t) ;
|
| cd->flags |= CONV_INPUT_PROCESSED ;
|
| if (word == UCS4_BOM) { /* Skip over initial byte order mark */
|
| if (first) {
|
| cd->flags &= ~CONV_REVERSE_INBYTE ;
|
| continue ;
|
| }
|
| }
|
| if (word == UCS4_BOM_REVERSE) {
|
| if (first) {
|
| cd->flags |= CONV_REVERSE_INBYTE ;
|
| continue ;
|
| }
|
| }
|
| break ;
|
| } while (1) ;
|
|
|
| if (cd->flags & CONV_REVERSE_INBYTE)
|
| word = REVERSE_UCS4_BYTE(word) ;
|
| *in = (uchar_t *)inptr ;
|
| return((int)word) ;
|
| }
|
| //LCOV_EXCL_STOP
|
| /*#endif /* USING_OPEN_SOURCE_input_ucs4 // JAC */
|
|
|
| NA_EIDPROC
|
| int __input_ucs2(_LC_fcconv_iconv_t *cd, uchar_t **in, int len)
|
| {
|
| ucs2_t *inptr = (ucs2_t *)*in ;
|
| int first = !(cd->flags & CONV_INPUT_PROCESSED) ;
|
| WChar_t word ; /*JAC */
|
|
|
| do {
|
| if (len < sizeof(ucs2_t))
|
| return(ERR_INPUT_INCOMPLETE) ;
|
|
|
| word = *inptr++ ;
|
| len -= sizeof(ucs2_t) ;
|
| cd->flags |= CONV_INPUT_PROCESSED ;
|
| if (word == UCS2_BOM) { /* Skip over initial byte order mark */
|
| if (first) {
|
| cd->flags &= ~CONV_REVERSE_INBYTE ;
|
| continue ;
|
| }
|
| }
|
| if (word == UCS2_BOM_REVERSE) {
|
| if (first) {
|
| cd->flags |= CONV_REVERSE_INBYTE ;
|
| continue ;
|
| }
|
| }
|
| break ;
|
| } while (1) ;
|
|
|
| if (cd->flags & CONV_REVERSE_INBYTE)
|
| word = REVERSE_UCS2_BYTE(word) ;
|
| /*
|
| * convert surrogates pair (<high-surrogate> <low-surrogate>) to UCS-4
|
| */
|
| if (IS_LSURROGATE(word)) /* Low surrogate without high surrogate */
|
| return(ERR_INVALID_CHAR) ;
|
|
|
| else if (IS_HSURROGATE(word)) {
|
| WChar_t low_surrogate ; /*JAC */
|
|
|
| if (len < sizeof(ucs2_t))
|
| return(ERR_INPUT_INCOMPLETE) ; /* Not enough input */
|
|
|
| low_surrogate = *inptr++ ;
|
| if (cd->flags & CONV_REVERSE_INBYTE)
|
| low_surrogate = REVERSE_UCS2_BYTE(low_surrogate) ;
|
| if (!IS_LSURROGATE(low_surrogate))
|
| return(ERR_INVALID_CHAR) ;
|
|
|
| word = SURROGATES_TO_UCS4(word, low_surrogate) ;
|
| }
|
| *in = (uchar_t *)inptr ;
|
| return((int)word) ;
|
| }
|
|
|
| NA_EIDPROC
|
| int __input_utf8(_LC_fcconv_iconv_t *cd, uchar_t **in, int len)
|
| {
|
| int first = !(cd->flags & CONV_INPUT_PROCESSED) ;
|
| uchar_t *inptr = *in ;
|
| int char_size ;
|
| WChar_t word ; /*JAC */
|
|
|
| word = *inptr ;
|
|
|
| if (IS_1BYTE_UTF8(word))
|
| char_size = 1 ;
|
| else if (IS_2BYTE_UTF8(word))
|
| char_size = 2 ;
|
| else if (IS_3BYTE_UTF8(word))
|
| char_size = 3 ;
|
| else if (IS_4BYTE_UTF8(word))
|
| char_size = 4 ;
|
| //LCOV_EXCL_START : cnu - We don't claim support for 5 or 6 byte long UTF8 chars yet.
|
| else if (IS_5BYTE_UTF8(word))
|
| char_size = 5 ;
|
| else if (IS_6BYTE_UTF8(word))
|
| char_size = 6 ;
|
| //LCOV_EXCL_STOP
|
| else
|
| return(ERR_INVALID_CHAR) ;
|
|
|
| if (len < char_size)
|
| return(ERR_INPUT_INCOMPLETE) ;
|
|
|
| CHECK_UTF8(inptr, char_size) ;
|
| switch (char_size)
|
| {
|
| /*
|
| * Over-long UTF-8 sequences are rejected for better security
|
| */
|
| case 1: word = RD_1BYTE_UTF8(inptr) ; break ;
|
| case 2: word = RD_2BYTE_UTF8(inptr) ;
|
| if (word < MIN_2BYTE_UTF8)
|
| return(ERR_INVALID_CHAR);
|
| break ;
|
| case 3: word = RD_3BYTE_UTF8(inptr) ;
|
| if (word < MIN_3BYTE_UTF8)
|
| return(ERR_INVALID_CHAR);
|
| break ;
|
| case 4: word = RD_4BYTE_UTF8(inptr) ;
|
| if (word < MIN_4BYTE_UTF8)
|
| return(ERR_INVALID_CHAR);
|
| break ;
|
| //LCOV_EXCL_START : cnu - We don't claim support for 5 or 6 byte long UTF8 chars yet.
|
| case 5: word = RD_5BYTE_UTF8(inptr) ;
|
| if (word < MIN_5BYTE_UTF8)
|
| return(ERR_INVALID_CHAR);
|
| break ;
|
| case 6: word = RD_6BYTE_UTF8(inptr) ;
|
| if (word < MIN_6BYTE_UTF8)
|
| return(ERR_INVALID_CHAR);
|
| break ;
|
| //LCOV_EXCL_STOP
|
| }
|
|
|
| inptr += char_size ;
|
| /*
|
| * Properly handle non-BMP characters formed by UTF-8 surrogate pairs
|
| */
|
| if ((char_size == 3) && (IS_HSURROGATE(word)))
|
| {
|
| WChar_t low_surrogate ; /*JAC */
|
|
|
| if (len == 3)
|
| return(ERR_INPUT_INCOMPLETE) ;
|
| if (!IS_3BYTE_UTF8(*inptr))
|
| return(ERR_INVALID_CHAR) ;
|
| low_surrogate = RD_3BYTE_UTF8(inptr) ;
|
| if (!IS_LSURROGATE(low_surrogate))
|
| return(ERR_INVALID_CHAR) ;
|
| word = SURROGATES_TO_UCS4(word, low_surrogate) ;
|
| }
|
| cd->flags |= CONV_INPUT_PROCESSED ;
|
| *in = inptr ;
|
| /*
|
| * Skip BOM if at the beginning of the file
|
| */
|
| if (first && (word == UCS4_BOM))
|
| return(__input_utf8(cd, in, len - char_size)) ;
|
| return((int)word) ;
|
| }
|
|
|
| /*
|
| * Due to the complex nature of UTF-7 to UCS conversion, it is done on a
|
| * character-by-character basis.
|
| */
|
| #ifdef USING_OPEN_SOURCE_UTF7 /* JAC */
|
| NA_EIDPROC
|
| int __input_utf7(_LC_fcconv_iconv_t *cd, uchar_t **in, int len)
|
| {
|
| uchar_t *inptr = *in ;
|
| utf7_conv_t *table ;
|
| ucs4_t ch, word ;
|
| int base64value ;
|
| int buffertemp ;
|
|
|
| if (!(cd->flags & CONV_UTF7_CONVTABLE))
|
| utf7_convtable_init(cd) ;
|
| table = (utf7_conv_t *)cd->convtable ;
|
|
|
| /*
|
| * If inptr is NULL, it indicates the end of conversion and
|
| * hence it is necessary to check the current states of conversion
|
| * for error, e.g. partially converted character.
|
| */
|
| if ((inptr == NULL) || (len <= 0)) {
|
| return(ERR_NOINPUT) ;
|
| }
|
|
|
| while (1)
|
| {
|
| /*
|
| * Incomplete input buffer sequence
|
| */
|
| if (len < 1)
|
| return(ERR_INPUT_INCOMPLETE) ;
|
|
|
| ch = *inptr++ ;
|
| len-- ;
|
|
|
| if (ch > 0x7f)
|
| return(ERR_INPUT_INCOMPLETE) ;
|
|
|
| if (table->shifted) {
|
| base64value = table->invbase64[ch] ;
|
|
|
| if (base64value < 0) {
|
| table->shifted = 0 ;
|
| /*
|
| * If SHIFT_IN is immediately followed by SHIFT_OUT,
|
| * this is a special case for the SHIFT_OUT character.
|
| */
|
| if (table->first && (ch == SHIFT_OUT)) {
|
| word = SHIFT_IN ;
|
| table->first = 0 ;
|
| break ;
|
| }
|
| if (!table->wroteone ||
|
| (READ_N_BITS(table->bits_in_buffer) != 0))
|
| return(ERR_INVALID_CHAR) ;
|
|
|
| if (ch == SHIFT_OUT)
|
| continue ; /* The SHIFT_OUT character is discarded */
|
| /* Fall through to the non-shifted state */
|
| }
|
| else {
|
| /* Add another 6 bits of base64 value to the buffer */
|
| WRITE_N_BITS(base64value, 6) ;
|
| table->first = 0 ;
|
| if (table->bits_in_buffer >= 16) {
|
| word = READ_N_BITS(16) ;
|
| table->wroteone = 1 ;
|
| if (table->high_surrogate) {
|
| if (!IS_LSURROGATE(word))
|
| return(ERR_INVALID_CHAR) ;
|
| word = SURROGATES_TO_UCS4(table->high_surrogate, word) ;
|
| table->high_surrogate = 0 ;
|
| }
|
| else if (IS_HSURROGATE(word)) {
|
| table->high_surrogate = word ;
|
| continue ;
|
| }
|
| break ;
|
| }
|
| continue ; /* Read next byte */
|
| }
|
| }
|
| if (ch == SHIFT_IN) {
|
| table->first = 1 ;
|
| table->shifted = 1 ;
|
| table->wroteone = 0 ;
|
| table->bits_in_buffer = 0 ;
|
| continue ; /* Read next byte */
|
| }
|
| /* It must be a directly encoded character */
|
| word = ch ;
|
| break ;
|
| }
|
|
|
| *in = (uchar_t *)inptr ;
|
| return((int)word) ;
|
| }
|
| #endif /* USING_OPEN_SOURCE_UTF7 // JAC */
|
|
|
| /************************************************************************/
|
| /* */
|
| /* UCS-2/UCS-4/UTF-7/UTF-8 output routines */
|
| /* Return: Number of bytes written to output buffer */
|
| /* */
|
| /************************************************************************/
|
| //LCOV_EXCL_START :cnu -- As of 8/30/2011, not used on SQ platform, but may be used on Clients
|
| NA_EIDPROC
|
| int __output_ucs4(_LC_fcconv_iconv_t *cd, uchar_t *out, int len, ucs4_t word)
|
| {
|
| ucs4_t *outptr = (ucs4_t *)out ;
|
|
|
| /*
|
| * For the special case that len == sizeof(ucs4_t)
|
| * and BOM has not been sent out, don't send out the BOM to avoid
|
| * sending out E2BIG error unnecessarily.
|
| */
|
| if (!(cd->flags & CONV_BOM_WRITTEN)) {
|
| if (len == sizeof(ucs4_t))
|
| cd->flags |= CONV_BOM_WRITTEN ; /* Don't send BOM */
|
| else if (len > sizeof(ucs4_t)) {
|
| *outptr++ = (cd->flags & CONV_REVERSE_OUTBYTE)
|
| ? UCS4_BOM_REVERSE : UCS4_BOM ;
|
| len -= sizeof(ucs4_t) ;
|
| cd->flags |= CONV_BOM_WRITTEN ;
|
| }
|
| }
|
|
|
| if (len < sizeof(ucs4_t))
|
| return(ERR_BUFFER_OVERRUN) ;
|
| if (cd->flags & CONV_REVERSE_OUTBYTE)
|
| word = REVERSE_UCS4_BYTE(word) ;
|
| *outptr++ = word ;
|
| return((uchar_t *)outptr - out) ;
|
| }
|
| //LCOV_EXCL_STOP
|
| /*#endif /* USING_OPEN_SOURCE_output_ucs4 // JAC */
|
|
|
| NA_EIDPROC
|
| int __output_ucs2(_LC_fcconv_iconv_t *cd, uchar_t *out, int len, ucs4_t word)
|
| {
|
| ucs2_t *outptr = (ucs2_t *)out ;
|
|
|
| /*
|
| * For the special case that len = sizeof(ucs2_t)
|
| * and BOM has not been sent out, don't send out the BOM to avoid
|
| * sending out E2BIG error unnecessarily.
|
| */
|
| if (!(cd->flags & CONV_BOM_WRITTEN)) {
|
| if (len == sizeof(ucs2_t))
|
| cd->flags |= CONV_BOM_WRITTEN ; /* Don't send BOM */
|
| else if (len > sizeof(ucs2_t)) {
|
| *outptr++ = (cd->flags & CONV_REVERSE_OUTBYTE)
|
| ? UCS2_BOM_REVERSE : UCS2_BOM ;
|
| len -= sizeof(ucs2_t) ;
|
| cd->flags |= CONV_BOM_WRITTEN ;
|
| }
|
| }
|
|
|
| /*
|
| * Not enough output buffer space?
|
| */
|
| if (len < sizeof(ucs2_t))
|
| return(ERR_BUFFER_OVERRUN) ;
|
|
|
| if (word > MAX_UCS4_VALUE)
|
| return(ERR_INVALID_CHAR) ;
|
|
|
| if (NEED_SURROGATE(word)) {
|
| ucs4_t high_surrogate ;
|
|
|
| if (len < 2 * sizeof(ucs2_t))
|
| return(ERR_BUFFER_OVERRUN) ;
|
|
|
| high_surrogate = GET_HSURROGATE(word) ;
|
| word = GET_LSURROGATE(word) ;
|
| if (cd->flags & CONV_REVERSE_OUTBYTE)
|
| high_surrogate = REVERSE_UCS2_BYTE(high_surrogate) ;
|
| *outptr++ = high_surrogate ;
|
| }
|
| if (cd->flags & CONV_REVERSE_OUTBYTE)
|
| word = REVERSE_UCS2_BYTE(word) ;
|
|
|
| *outptr++ = word ;
|
| return((uchar_t *)outptr - out) ;
|
| }
|
|
|
| NA_EIDPROC
|
| int __output_utf8(_LC_fcconv_iconv_t *cd, uchar_t *out, int len, ucs4_t word)
|
| {
|
| uchar_t *outptr = out ;
|
| int char_size ;
|
|
|
| if (IS_UCS_TO_1B_UTF8(word))
|
| char_size = 1 ;
|
| else if (IS_UCS_TO_2B_UTF8(word))
|
| char_size = 2 ;
|
| else if (IS_UCS_TO_3B_UTF8(word))
|
| char_size = 3 ;
|
| else if (IS_UCS_TO_4B_UTF8(word))
|
| char_size = 4 ;
|
| //LCOV_EXCL_START : cnu - We don't claim support for 5 or 6 byte long UTF8 chars yet.
|
| else if (IS_UCS_TO_5B_UTF8(word))
|
| char_size = 5 ;
|
| else if (IS_UCS_TO_6B_UTF8(word))
|
| char_size = 6 ;
|
| //LCOV_EXCL_STOP
|
| else
|
| return(ERR_INVALID_CHAR) ;
|
|
|
| /*
|
| * Not enough output buffer space?
|
| */
|
| if (len < char_size)
|
| return(ERR_BUFFER_OVERRUN) ;
|
|
|
| switch (char_size)
|
| {
|
| case 1: WR_1BYTE_UTF8(outptr, word) ; break ;
|
| case 2: WR_2BYTE_UTF8(outptr, word) ; break ;
|
| case 3: WR_3BYTE_UTF8(outptr, word) ; break ;
|
| case 4: WR_4BYTE_UTF8(outptr, word) ; break ;
|
| //LCOV_EXCL_START : cnu - We don't claim support for 5 or 6 byte long UTF8 chars yet.
|
| case 5: WR_5BYTE_UTF8(outptr, word) ; break ;
|
| case 6: WR_6BYTE_UTF8(outptr, word) ; break ;
|
| //LCOV_EXCL_STOP
|
| }
|
| return(char_size) ;
|
| }
|
|
|
| #ifdef USING_OPEN_SOURCE_output_utf7 /* JAC */
|
| NA_EIDPROC
|
| int __output_utf7(_LC_fcconv_iconv_t *cd, uchar_t *out, int len, ucs4_t word)
|
| {
|
| uchar_t *outptr = out ;
|
| utf7_conv_t *table ;
|
| uchar_t *orig_outptr ;
|
| int buffertemp ;
|
| int needshift ;
|
| int char_size ;
|
|
|
| if (!(cd->flags & CONV_UTF7_CONVTABLE))
|
| utf7_convtable_init(cd) ;
|
| table = (utf7_conv_t *)cd->convtable ;
|
|
|
| orig_outptr = outptr ;
|
| needshift = (word > 0x7f) || table->mustshift[word] ;
|
|
|
| /*
|
| * Estimate required size of output buffer
|
| */
|
| if (!needshift)
|
| char_size = table->shifted ? 2 : 1 ;
|
| else if (word < 0x10000)
|
| char_size = table->shifted ? 3 : 4 ;
|
| else if (NEED_SURROGATE(word))
|
| char_size = table->shifted ? 6 : 7 ;
|
| else
|
| return(ERR_INVALID_CHAR) ;
|
|
|
| /*
|
| * Not enough output buffer space?
|
| */
|
| if (len < char_size)
|
| return(ERR_BUFFER_OVERRUN) ;
|
|
|
| /*
|
| * Write output bytes
|
| */
|
| if (needshift && !table->shifted) {
|
| *outptr++ = SHIFT_IN ;
|
| /*
|
| * Check for the special case of SHIFT_IN character
|
| */
|
| if (word == SHIFT_IN) {
|
| *outptr++ = SHIFT_OUT ;
|
| return(outptr - out) ;
|
| }
|
| else
|
| table->shifted = 1 ;
|
| }
|
|
|
| if (table->shifted) {
|
| /*
|
| * Either write the character to the bit buffer,
|
| * or pad the bit buffer out to a full base64 character.
|
| */
|
| if (needshift) {
|
| if (NEED_SURROGATE(word)) {
|
| register WChar_t hiword ; /*JAC */
|
|
|
| hiword = GET_HSURROGATE(word) ;
|
| word = GET_LSURROGATE(word) ;
|
| WRITE_N_BITS(hiword, 16) ;
|
| while (table->bits_in_buffer > 6)
|
| *outptr++ = base64[READ_N_BITS(6)] ;
|
| }
|
| WRITE_N_BITS(word, 16) ;
|
| }
|
| else
|
| WRITE_N_BITS(0, (6 - (table->bits_in_buffer%6)) % 6) ;
|
|
|
| /*
|
| * Flush out as many full base64 characters as possible from the
|
| * bit buffer.
|
| */
|
| while (table->bits_in_buffer > 6)
|
| *outptr++ = base64[READ_N_BITS(6)] ;
|
|
|
| if (!needshift) {
|
| /*
|
| * Write the explicit shift out character if
|
| * 1) The caller has requested we always do it, or
|
| * 2) The directly encoded character is in the base64 set.
|
| */
|
| if ((cd->flags & CONV_UTF7_VERBOSE ) ||
|
| (table->invbase64[word] >= 0))
|
| *outptr++ = SHIFT_OUT ;
|
| table->shifted = 0 ;
|
| }
|
| }
|
|
|
| /*
|
| * The character can be directly encoded as ASCII
|
| */
|
| if (!needshift) {
|
| /*
|
| * Check buffer space again as the byte size estimate
|
| * may not be correct.
|
| */
|
| if (len < 1)
|
| return(ERR_BUFFER_OVERRUN) ;
|
| *outptr++ = word ;
|
| }
|
|
|
| return(outptr - out) ;
|
| }
|
| #endif /* USING_OPEN_SOURCE_output_utf7 // JAC */
|