| /**********************************************************************
|
| // @@@ START COPYRIGHT @@@ |
| // |
| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // @@@ END COPYRIGHT @@@ |
| **********************************************************************/
|
| /*
|
| * HISTORY
|
| * $Log: from_GB18030.c,v $
|
| * Revision 1.1.8.1 2001/12/07 15:37:38
|
| * * Remove __UDC_to_gbk.
|
| * * Add UCS plane 1-16 character mapping support.
|
| *
|
| * Revision 1.1.4.3 2000/12/12 14:16:18
|
| * Add UDC checking code in GB18030 to UCS conversion routine.
|
| *
|
| * Revision 1.1.4.2 2000/12/11 19:41:16
|
| * Incorporate update due to the new GB18030 to UCS conversion table.
|
| *
|
| * Revision 1.1.4.1 2000/10/16 18:44:45
|
| * COSIX.Zulu to Yankee merge for GB18030 support.
|
| *
|
| * Revision 1.1.2.1 2000/08/07 14:33:44
|
| * Add support for Unicode to GB18030-2000 conversion.
|
| * [2000/08/01 17:40:14 Waiman_Long]
|
| *
|
| * $EndLog$
|
| */
|
|
|
| #include "fcconv.h"
|
| #include "multi-byte.h"
|
|
|
/*
 * Valid 4-byte GB18030 - 1st byte: 0x81-0xfe
 *                      - 2nd byte: 0x30-0x39
 *                      - 3rd byte: 0x81-0xfe
 *                      - 4th byte: 0x30-0x39
 *
 * The four bytes are treated as the digits of a mixed-radix number
 * (radix 126, 10, 126, 10 from the first byte to the last), which gives
 * every valid 4-byte code a dense linear index starting at 0 for
 * 0x81308130.
 *
 *   GB18030_4IDX(c1,c2,c3,c4) - index from the four individual bytes
 *   GB18030_GBIDX(c)          - index from a packed 32-bit code
 *                               (first byte in bits 31..24)
 *   GBIDX_GB18030(idx, gb)    - inverse mapping: stores the packed code
 *                               in gb.  NOTE: idx is modified (divided
 *                               down) as a side effect, so it must be a
 *                               scratch lvalue.
 */
#define GB18030_4NROWS          (0xfe - 0x81 + 1)
#define GB18030_4NCOLS          (0x39 - 0x30 + 1)
#define GB18030_4SIZE_4         GB18030_4NCOLS
#define GB18030_4SIZE_3         (GB18030_4NROWS * GB18030_4SIZE_4)
#define GB18030_4SIZE_2         (GB18030_4NCOLS * GB18030_4SIZE_3)
#define GB18030_4IDX(c1,c2,c3,c4)                       \
        (((c1) - 0x81) * GB18030_4SIZE_2 +              \
         ((c2) - 0x30) * GB18030_4SIZE_3 +              \
         ((c3) - 0x81) * GB18030_4SIZE_4 +              \
         ((c4) - 0x30))
/* The argument is parenthesized so that expressions such as (a | b) work. */
#define GB18030_GBIDX(c)        GB18030_4IDX(((c) >> 24) & 0xff,        \
                                             ((c) >> 16) & 0xff,        \
                                             ((c) >>  8) & 0xff,        \
                                             (c) & 0xff)
/*
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe inside an unbraced if/else).  The byte values are shifted as
 * unsigned quantities: a first byte >= 0x81 shifted left by 24 does not
 * fit in a signed int.
 */
#define GBIDX_GB18030(idx, gb)                                                  \
        do {                                                                    \
            (gb)   = (int )(idx) % GB18030_4NCOLS + 0x30 ;                      \
            (idx) /= GB18030_4NCOLS ;                                           \
            (gb)  |= (unsigned)((int )(idx) % GB18030_4NROWS + 0x81) <<  8 ;    \
            (idx) /= GB18030_4NROWS ;                                           \
            (gb)  |= (unsigned)((int )(idx) % GB18030_4NCOLS + 0x30) << 16 ;    \
            (gb)  |= (unsigned)((int )(idx) / GB18030_4NCOLS + 0x81) << 24 ;    \
        } while (0)
|
|
|
/*
 * 4-byte UDC checking macros (0x8336d030-0x84308130 -> U+E865-U+F8FF)
 *
 *   GB4_UIDXLO / GB4_UIDXHI - first / last 4-byte index of the UDC range
 *   GB4_UDCLO  / GB4_UDCHI  - first / last UCS value of the UDC range
 *   GB18030_UIDX4(idx)      - true if idx lies inside the 4-byte UDC range
 *   GB18030_IDX4UDC4(idx)   - 4-byte index -> UCS value (linear shift)
 *   GB18030_UDC4IDX4(udc)   - UCS value -> 4-byte index (linear shift)
 *   GB18030_4IDX_MAX        - index of 0x8431a437; __gb18030_index() looks
 *                             indices up to this value up in the mapping
 *                             table
 *
 * Macro parameters are parenthesized so that compound expressions can be
 * passed safely.
 */
#define GB4_UIDXLO              GB18030_4IDX(0x83, 0x36, 0xd0, 0x30)
#define GB4_UIDXHI              GB18030_4IDX(0x84, 0x30, 0x81, 0x30)
#define GB4_UDCLO               0xE865
#define GB4_UDCHI               0xF8FF
#define GB18030_UIDX4(idx)      ((GB4_UIDXLO <= (idx)) && ((idx) <= GB4_UIDXHI))
#define GB18030_IDX4UDC4(idx)   ((idx) - GB4_UIDXLO + GB4_UDCLO)
#define GB18030_UDC4IDX4(udc)   ((udc) - GB4_UDCLO + GB4_UIDXLO)
#define GB18030_4IDX_MAX        GB18030_4IDX(0x84, 0x31, 0xa4, 0x37)
|
|
|
/*
 * UCS user-defined characters U+E766 through U+E864 have no linear
 * relationship to their GB18030 codes; they are mapped by table lookup.
 *
 *   GB18030_TUDCLO / GB18030_TUDCHI - inclusive bounds of that UCS range
 *   GB18030_TUDCLEN                 - number of code points in the range
 *   GB18030_TUDC(udc)               - true if udc falls inside the range
 */
#define GB18030_TUDCLO          0xE766
#define GB18030_TUDCHI          0xE864
#define GB18030_TUDCLEN         (1 + GB18030_TUDCHI - GB18030_TUDCLO)
#define GB18030_TUDC(udc)       (((udc) >= GB18030_TUDCLO) && \
                                 ((udc) <= GB18030_TUDCHI))
|
|
|
/*
 * UCS plane 1-16 mapping
 *     0x90308130-0xe339fe39 (1058400 codepoints) -> Plane 1-16
 *
 * Within this block the 4-byte index and the UCS value differ only by a
 * constant: index GB18030E_4IDX_MIN corresponds to U+10000.
 *
 *   IS_4IDXE(idx)     - true if idx lies inside the block (inclusive)
 *   UCS_TO_4IDXE(ucs) - UCS value -> 4-byte index
 *   UCS_FR_4IDXE(idx) - 4-byte index -> UCS value
 */
#define GB18030E_4IDX_MIN       GB18030_4IDX(0x90, 0x30, 0x81, 0x30)
#define GB18030E_4IDX_MAX       GB18030_4IDX(0xe3, 0x39, 0xfe, 0x39)
#define IS_4IDXE(idx)           (((idx) >= GB18030E_4IDX_MIN) && \
                                 ((idx) <= GB18030E_4IDX_MAX))
#define UCS_TO_4IDXE(ucs)       ((ucs) - 0x10000 + GB18030E_4IDX_MIN)
#define UCS_FR_4IDXE(idx)       ((idx) + 0x10000 - GB18030E_4IDX_MIN)
|
|
|
/*
 * Miscellaneous macros
 *
 *   ARRSIZE(arr) - number of elements in a true array (not a pointer)
 *   SFUNC_CAST   - cast prefix yielding the comparison-function pointer
 *                  type int (*)(const void *, const void *)
 */
#define ARRSIZE(arr)    (sizeof(arr) / sizeof((arr)[0]))
#define SFUNC_CAST      (int (*)(const void *, const void *))
|
|
|
/*
 * 4-byte GB18030 codes are assumed to be rare, so they are converted with
 * a slower but more memory-efficient scheme than a full lookup table:
 * each table entry describes a whole range of codes that share one
 * constant offset.
 *
 * gb4idx_ucs_t maps a 4-byte GB18030 index to UCS.  An entry covers every
 * index that is >= its own gbidx and < the gbidx of the next entry; adding
 * offset to such an index produces the UCS value.
 *
 * ucs_gb4idx_t maps a UCS value back to a 4-byte GB18030 index.  An entry
 * covers the inclusive range lo_ucs..hi_ucs; adding offset to a UCS value
 * in that range produces the index.
 */
typedef struct {
    int gbidx ;         /* first 4-byte GB18030 index of the range */
    int offset ;        /* index + offset == UCS value */
} gb4idx_ucs_t ;

typedef struct {
    int lo_ucs ;        /* lowest UCS value of the range */
    int hi_ucs ;        /* highest UCS value of the range */
    int offset ;        /* UCS value + offset == 4-byte GB18030 index */
} ucs_gb4idx_t ;
|
|
|
| #ifndef NODATA
|
| #include "gb18030_data.c"
|
| #else
|
| static const ushort_t udc_gb_table [] = { 0 } ;
|
| static const gb4idx_ucs_t gb4idx_ucs_table[] = { 0, 0 } ;
|
| static const ucs_gb4idx_t ucs_gb4idx_table[] = { 0, 0, 0 } ;
|
| #endif
|
|
|
| NA_EIDPROC
|
| static int
|
| compare_gb4idx_ucs(gb4idx_ucs_t *ptr1, gb4idx_ucs_t *ptr2)
|
| {
|
| int factor = 1 ;
|
| gb4idx_ucs_t *ptr ;
|
|
|
| if (ptr2->offset == INT_MAX)
|
| {
|
| /*
|
| * Swap ptr1 and ptr2 & set factor to -1
|
| */
|
| ptr = ptr1 ;
|
| ptr1 = ptr2 ;
|
| ptr2 = ptr ;
|
| factor = -1 ;
|
| }
|
| if (ptr1->gbidx < ptr2->gbidx)
|
| return(-1 * factor) ;
|
| if ((ptr1->gbidx >= ptr2->gbidx) && (ptr1->gbidx < ptr2[1].gbidx))
|
| return(0) ; /* A match */
|
| else
|
| return(factor) ;
|
| }
|
|
|
| NA_EIDPROC
|
| static int
|
| compare_gb4idx_ucs_C(const void *ptr1, const void *ptr2) //JAC - CAST FUNCTION needed to make our NT compiler happy
|
| {
|
| return compare_gb4idx_ucs( (gb4idx_ucs_t *) ptr1, (gb4idx_ucs_t *) ptr2 );
|
| }
|
|
|
| NA_EIDPROC
|
| static int
|
| compare_ucs_gb4idx(ucs_gb4idx_t *ptr1, ucs_gb4idx_t *ptr2)
|
| {
|
| int factor = 1 ;
|
| ucs_gb4idx_t *ptr ;
|
|
|
| if (ptr2->offset == INT_MAX)
|
| {
|
| /*
|
| * Swap ptr1 and ptr2 & set factor to -1
|
| */
|
| ptr = ptr1 ;
|
| ptr1 = ptr2 ;
|
| ptr2 = ptr ;
|
| factor = -1 ;
|
| }
|
| if (ptr1->lo_ucs < ptr2->lo_ucs)
|
| return(-1 * factor) ;
|
| if (ptr1->lo_ucs > ptr2->hi_ucs)
|
| return(factor) ;
|
| return(0) ; /* A match */
|
| }
|
|
|
| NA_EIDPROC
|
| static int
|
| compare_ucs_gb4idx_C(const void *ptr1, const void *ptr2) //JAC - CAST FUNCTION needed to make our NT compiler happy
|
| {
|
| return compare_ucs_gb4idx( (ucs_gb4idx_t *) ptr1, (ucs_gb4idx_t *) ptr2 );
|
| }
|
|
|
|
|
| /*
|
| * This routine maps a UCS UDC value in the table mapping range back to
|
| * a 2-byte GB18030 code if the value >= 0xa1a1. Otherwise, it is assumed
|
| * to be a 4-byte index and converted to a 4-byte GB18030 code.
|
| */
|
| NA_EIDPROC
|
| static uint_t
|
| tudc_to_gb(int udc)
|
| {
|
| uint_t gb = udc_gb_table[udc - GB18030_TUDCLO] ;
|
| if (gb < 0xa1a1)
|
| {
|
| int idx = gb ;
|
| GBIDX_GB18030(idx, gb) ; /* UDC area 4 */
|
| }
|
| return(gb) ;
|
| }
|
|
|
| /*
|
| * This routine converts one GB18030 character from the input stream to table
|
| * index or directly to Unicode value.
|
| *
|
| * Return Value:
|
| * Table index if no error
|
| * -1 - Invalid sequence (EILSEQ)
|
| * -2 - Input incomplete
|
| */
|
| NA_EIDPROC
|
| int
|
| __gb18030_index(_LC_fcconv_iconv_t * cd, uchar_t **in, int len)
|
| {
|
| uint ch1, ch2 ;
|
| uint ch3, ch4 ;
|
| uint idx ;
|
| uchar *ip ;
|
|
|
| ip = *in ;
|
| if (GB18030_1BYTE(*ip))
|
| {
|
| ch1 = *ip++ ;
|
| *in = ip ;
|
| return(ch1) ;
|
| }
|
|
|
| if (len < 2)
|
| return(ERR_INPUT_INCOMPLETE) ;
|
|
|
| ch1 = *ip++ ;
|
| ch2 = *ip++ ;
|
|
|
| if (GB18030_2BYTE(ch1, ch2))
|
| {
|
| *in = ip ; /* Adjust input pointer */
|
| if (GB18030_PLANE1(ch1, ch2))
|
| {
|
| if (GB18030_UDC11(ch1, ch2) || GB18030_UDC12(ch1, ch2))
|
| return(GB18030_IDXU1(ch1, ch2)) ;
|
| return(GB18030_IDX1(ch1, ch2)) ;
|
| }
|
| else if (GB18030_PLANE2(ch1, ch2))
|
| {
|
| return(GB18030_IDX2(ch1, ch2)) ;
|
| }
|
| else /* GB18030_PLANE3(ch1, ch2) */
|
| {
|
| if (GB18030_UDC3(ch1, ch2))
|
| return(GB18030_IDXU3(ch1, ch2)) ;
|
| return(GB18030_IDX3(ch1, ch2)) ;
|
| }
|
| }
|
| if (GB18030_4BYTE(ch1, ch2))
|
| {
|
| gb4idx_ucs_t key ;
|
| gb4idx_ucs_t *result ;
|
|
|
| if (len < 4)
|
| return(ERR_INPUT_INCOMPLETE) ;
|
| ch3 = *ip++ ;
|
| ch4 = *ip++ ;
|
| if (!GB18030_4BYTE(ch3, ch4))
|
| return(ERR_INVALID_CHAR) ;
|
| if ((idx = GB18030_4IDX(ch1, ch2, ch3, ch4)) > GB18030_4IDX_MAX)
|
| {
|
| if (IS_4IDXE(idx)) {
|
| *in = ip ; /* Adjust input pointer */
|
| return(UCS_FR_4IDXE(idx) | UCODE_MASK) ;
|
| }
|
| return(ERR_INVALID_CHAR) ; /* Invalid character */
|
| }
|
| /*
|
| * Search the mapping table for a match entry
|
| */
|
| key.gbidx = idx ;
|
| key.offset = INT_MAX ;
|
| result = (gb4idx_ucs_t *)bsearch(&key, gb4idx_ucs_table, ARRSIZE(gb4idx_ucs_table), //JAC
|
| sizeof(gb4idx_ucs_table[0]),
|
| compare_gb4idx_ucs_C) ; //JAC
|
| *in = ip ; /* Adjust input pointer */
|
| /* Return plain Unicode value */
|
| return((idx + result->offset) | UCODE_MASK) ;
|
| }
|
| return(ERR_INVALID_CHAR) ;
|
| }
|
|
|
| /*
|
| * This routine maps UDC characters in Unicode to those in GB18030
|
| */
|
| NA_EIDPROC
|
| WChar_t // JAC
|
| __UDC_to_gb18030(ucs4_t ucs)
|
| {
|
| int uidx = UCS_UIDX(ucs) ;
|
| int gb4 ;
|
| int gb4idx ;
|
|
|
| if (ucs < GB18030_TUDCLO)
|
| return(UIDX_GB18030(uidx)) ; /* UDC areas 1, 2, 3 */
|
| if (ucs <= GB18030_TUDCHI)
|
| return(tudc_to_gb(ucs)) ; /* Table lookup UDC */
|
| gb4idx = (ucs <= UCS_UDC_END) ? GB18030_UDC4IDX4(ucs)
|
| : UCS_TO_4IDXE (ucs) ;
|
| GBIDX_GB18030(gb4idx, gb4) ; /* UDC area 4 */
|
| return((WChar_t)gb4) ; // JAC
|
| }
|
|
|
| /*
|
| * This routine maps UCS characters to 4-byte GB18030 characters
|
| */
|
| NA_EIDPROC
|
| WChar_t // JAC
|
| __UCS_to_gb18030(ucs4_t ucs)
|
| {
|
| ucs_gb4idx_t key, *result ;
|
| int gb4idx ;
|
| int gb4 ;
|
|
|
| if (ucs < 0x10000)
|
| {
|
| key.hi_ucs = key.lo_ucs = ucs ;
|
| key.offset = INT_MAX ;
|
| if ((result = (ucs_gb4idx_t *)bsearch(&key, ucs_gb4idx_table, ARRSIZE(ucs_gb4idx_table), //JAC
|
| sizeof(ucs_gb4idx_table[0]),
|
| compare_ucs_gb4idx_C)) == NULL) //JAC
|
| return(BAD) ;
|
| gb4idx = result->offset + ucs;
|
| }
|
| else if (ucs < 0x110000)
|
| gb4idx = UCS_TO_4IDXE(ucs) ; /* Map plane 1-16 UCS character */
|
| else
|
| return(BAD) ;
|
| GBIDX_GB18030(gb4idx, gb4) ;
|
| return((WChar_t)gb4) ; // JAC
|
| }
|