blob: 8b02627d71c13a2d98dc11df2ac74745be32ae7f [file] [log] [blame]
/**********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
**********************************************************************/
/*
* HISTORY
* $Log: from_GB18030.c,v $
* Revision 1.1.8.1 2001/12/07 15:37:38
* * Remove __UDC_to_gbk.
* * Add UCS plane 1-16 character mapping support.
*
* Revision 1.1.4.3 2000/12/12 14:16:18
* Add UDC checking code in GB18030 to UCS conversion routine.
*
* Revision 1.1.4.2 2000/12/11 19:41:16
* Incorporate update due to the new GB18030 to UCS conversion table.
*
* Revision 1.1.4.1 2000/10/16 18:44:45
* COSIX.Zulu to Yankee merge for GB18030 support.
*
* Revision 1.1.2.1 2000/08/07 14:33:44
* Add support for Unicode to GB18030-2000 conversion.
* [2000/08/01 17:40:14 Waiman_Long]
*
* $EndLog$
*/
#include "fcconv.h"
#include "multi-byte.h"
/*
* Valid 4-byte GB18030 - 1st byte: 0x81-0xfe
* - 2nd byte: 0x30-0x39
* - 3rd byte: 0x81-0xfe
* - 4th byte: 0x30-0x39
*
* Macro to compute the index of the given 4-byte GB18030 character.
*/
#define GB18030_4NROWS (0xfe - 0x81 + 1)
#define GB18030_4NCOLS (0x39 - 0x30 + 1)
#define GB18030_4SIZE_4 GB18030_4NCOLS
#define GB18030_4SIZE_3 (GB18030_4NROWS * GB18030_4SIZE_4)
#define GB18030_4SIZE_2 (GB18030_4NCOLS * GB18030_4SIZE_3)
#define GB18030_4IDX(c1,c2,c3,c4) \
(((c1) - 0x81) * GB18030_4SIZE_2 + \
((c2) - 0x30) * GB18030_4SIZE_3 + \
((c3) - 0x81) * GB18030_4SIZE_4 + \
((c4) - 0x30))
#define GB18030_GBIDX(c) GB18030_4IDX((c >> 24) & 0xff, \
(c >> 16) & 0xff, \
(c >> 8) & 0xff, \
c & 0xff)
#define GBIDX_GB18030(idx, gb) \
{ \
(gb ) = (int )(idx) % GB18030_4NCOLS + 0x30 ; \
(idx) /= GB18030_4NCOLS ; \
(gb ) |= ((int )(idx) % GB18030_4NROWS + 0x81) << 8 ; \
(idx) /= GB18030_4NROWS ; \
(gb ) |= ((int )(idx) % GB18030_4NCOLS + 0x30) << 16 ; \
(gb ) |= ((int )(idx) / GB18030_4NCOLS + 0x81) << 24 ; \
}
/*
* 4-byte UDC checking macros (0x8336d030-0x84308130 -> U+E865-U+F8FF)
*/
#define GB4_UIDXLO GB18030_4IDX(0x83, 0x36, 0xd0, 0x30)
#define GB4_UIDXHI GB18030_4IDX(0x84, 0x30, 0x81, 0x30)
#define GB4_UDCLO 0xE865
#define GB4_UDCHI 0xF8FF
#define GB18030_UIDX4(idx) ((GB4_UIDXLO <= (idx)) && ((idx) <= GB4_UIDXHI))
#define GB18030_IDX4UDC4(idx) (idx - GB4_UIDXLO + GB4_UDCLO )
#define GB18030_UDC4IDX4(udc) (udc - GB4_UDCLO + GB4_UIDXLO)
#define GB18030_4IDX_MAX GB18030_4IDX(0x84, 0x31, 0xa4, 0x37)
/*
* UDC range U+E766-U+E864 will be mapped by table lookup
*/
#define GB18030_TUDCLO 0xE766
#define GB18030_TUDCHI 0xE864
#define GB18030_TUDCLEN (GB18030_TUDCHI - GB18030_TUDCLO + 1)
#define GB18030_TUDC(udc) ((GB18030_TUDCLO <= (udc)) && ((udc) <= GB18030_TUDCHI))
/*
* UCS plane 1-16 mapping
* 0x90308130-0xe339fe39 (1058400 codepoints) -> Plane 1-16
*/
#define GB18030E_4IDX_MIN GB18030_4IDX(0x90, 0x30, 0x81, 0x30)
#define GB18030E_4IDX_MAX GB18030_4IDX(0Xe3, 0x39, 0xfe, 0x39)
#define IS_4IDXE(idx) ((GB18030E_4IDX_MIN <= (idx)) && \
((idx) <= GB18030E_4IDX_MAX))
#define UCS_TO_4IDXE(ucs) ((ucs) - 0x10000 + GB18030E_4IDX_MIN)
#define UCS_FR_4IDXE(idx) ((idx) + 0x10000 - GB18030E_4IDX_MIN)
/*
* Miscellaneous macros
*/
#define ARRSIZE(arr) (sizeof(arr)/sizeof(arr[0]))
#define SFUNC_CAST (int (*)(const void *, const void *))
/*
* Assuming that 4-byte GB18030 code is rare. We can use a slower but
* more memory efficient method of mapping those 4-byte codes to Unicode
* and vice versa.
*
* The gbidx_ucs_t structure is used to map 4-byte GB18030 index to the
* corresponding UCS value. For all index value >= gbidx and < gbidx of
* the next entry, the offset value is added to produce the final UCS
* result.
*
* The ucs_gbidx_t structure is for mapping UCS value back to 4-byte
* GB18030 index.
*/
typedef struct
{
int gbidx ; /* 4-byte GB18030 index of first one in a range */
int offset ; /* Offset that adds to index to produce UCS value */
} gb4idx_ucs_t ;
typedef struct
{
int lo_ucs ; /* Low limit of UCS range */
int hi_ucs ; /* High limit of UCS range */
int offset ; /* Offset to be added to produce index value */
} ucs_gb4idx_t ;
#ifndef NODATA
#include "gb18030_data.c"
#else
static const ushort_t udc_gb_table [] = { 0 } ;
static const gb4idx_ucs_t gb4idx_ucs_table[] = { 0, 0 } ;
static const ucs_gb4idx_t ucs_gb4idx_table[] = { 0, 0, 0 } ;
#endif
static int
compare_gb4idx_ucs(gb4idx_ucs_t *ptr1, gb4idx_ucs_t *ptr2)
{
int factor = 1 ;
gb4idx_ucs_t *ptr ;
if (ptr2->offset == INT_MAX)
{
/*
* Swap ptr1 and ptr2 & set factor to -1
*/
ptr = ptr1 ;
ptr1 = ptr2 ;
ptr2 = ptr ;
factor = -1 ;
}
if (ptr1->gbidx < ptr2->gbidx)
return(-1 * factor) ;
if ((ptr1->gbidx >= ptr2->gbidx) && (ptr1->gbidx < ptr2[1].gbidx))
return(0) ; /* A match */
else
return(factor) ;
}
static int
compare_gb4idx_ucs_C(const void *ptr1, const void *ptr2) //JAC - CAST FUNCTION needed to make our NT compiler happy
{
return compare_gb4idx_ucs( (gb4idx_ucs_t *) ptr1, (gb4idx_ucs_t *) ptr2 );
}
static int
compare_ucs_gb4idx(ucs_gb4idx_t *ptr1, ucs_gb4idx_t *ptr2)
{
int factor = 1 ;
ucs_gb4idx_t *ptr ;
if (ptr2->offset == INT_MAX)
{
/*
* Swap ptr1 and ptr2 & set factor to -1
*/
ptr = ptr1 ;
ptr1 = ptr2 ;
ptr2 = ptr ;
factor = -1 ;
}
if (ptr1->lo_ucs < ptr2->lo_ucs)
return(-1 * factor) ;
if (ptr1->lo_ucs > ptr2->hi_ucs)
return(factor) ;
return(0) ; /* A match */
}
static int
compare_ucs_gb4idx_C(const void *ptr1, const void *ptr2) //JAC - CAST FUNCTION needed to make our NT compiler happy
{
return compare_ucs_gb4idx( (ucs_gb4idx_t *) ptr1, (ucs_gb4idx_t *) ptr2 );
}
/*
* This routine maps a UCS UDC value in the table mapping range back to
* a 2-byte GB18030 code if the value >= 0xa1a1. Otherwise, it is assumed
* to be a 4-byte index and converted to a 4-byte GB18030 code.
*/
static uint_t
tudc_to_gb(int udc)
{
uint_t gb = udc_gb_table[udc - GB18030_TUDCLO] ;
if (gb < 0xa1a1)
{
int idx = gb ;
GBIDX_GB18030(idx, gb) ; /* UDC area 4 */
}
return(gb) ;
}
/*
* This routine converts one GB18030 character from the input stream to table
* index or directly to Unicode value.
*
* Return Value:
* Table index if no error
* -1 - Invalid sequence (EILSEQ)
* -2 - Input incomplete
*/
int
__gb18030_index(_LC_fcconv_iconv_t * cd, uchar_t **in, int len)
{
uint ch1, ch2 ;
uint ch3, ch4 ;
uint idx ;
uchar *ip ;
ip = *in ;
if (GB18030_1BYTE(*ip))
{
ch1 = *ip++ ;
*in = ip ;
return(ch1) ;
}
if (len < 2)
return(ERR_INPUT_INCOMPLETE) ;
ch1 = *ip++ ;
ch2 = *ip++ ;
if (GB18030_2BYTE(ch1, ch2))
{
*in = ip ; /* Adjust input pointer */
if (GB18030_PLANE1(ch1, ch2))
{
if (GB18030_UDC11(ch1, ch2) || GB18030_UDC12(ch1, ch2))
return(GB18030_IDXU1(ch1, ch2)) ;
return(GB18030_IDX1(ch1, ch2)) ;
}
else if (GB18030_PLANE2(ch1, ch2))
{
return(GB18030_IDX2(ch1, ch2)) ;
}
else /* GB18030_PLANE3(ch1, ch2) */
{
if (GB18030_UDC3(ch1, ch2))
return(GB18030_IDXU3(ch1, ch2)) ;
return(GB18030_IDX3(ch1, ch2)) ;
}
}
if (GB18030_4BYTE(ch1, ch2))
{
gb4idx_ucs_t key ;
gb4idx_ucs_t *result ;
if (len < 4)
return(ERR_INPUT_INCOMPLETE) ;
ch3 = *ip++ ;
ch4 = *ip++ ;
if (!GB18030_4BYTE(ch3, ch4))
return(ERR_INVALID_CHAR) ;
if ((idx = GB18030_4IDX(ch1, ch2, ch3, ch4)) > GB18030_4IDX_MAX)
{
if (IS_4IDXE(idx)) {
*in = ip ; /* Adjust input pointer */
return(UCS_FR_4IDXE(idx) | UCODE_MASK) ;
}
return(ERR_INVALID_CHAR) ; /* Invalid character */
}
/*
* Search the mapping table for a match entry
*/
key.gbidx = idx ;
key.offset = INT_MAX ;
result = (gb4idx_ucs_t *)bsearch(&key, gb4idx_ucs_table, ARRSIZE(gb4idx_ucs_table), //JAC
sizeof(gb4idx_ucs_table[0]),
compare_gb4idx_ucs_C) ; //JAC
*in = ip ; /* Adjust input pointer */
/* Return plain Unicode value */
return((idx + result->offset) | UCODE_MASK) ;
}
return(ERR_INVALID_CHAR) ;
}
/*
* This routine maps UDC characters in Unicode to those in GB18030
*/
WChar_t // JAC
__UDC_to_gb18030(ucs4_t ucs)
{
int uidx = UCS_UIDX(ucs) ;
int gb4 ;
int gb4idx ;
if (ucs < GB18030_TUDCLO)
return(UIDX_GB18030(uidx)) ; /* UDC areas 1, 2, 3 */
if (ucs <= GB18030_TUDCHI)
return(tudc_to_gb(ucs)) ; /* Table lookup UDC */
gb4idx = (ucs <= UCS_UDC_END) ? GB18030_UDC4IDX4(ucs)
: UCS_TO_4IDXE (ucs) ;
GBIDX_GB18030(gb4idx, gb4) ; /* UDC area 4 */
return((WChar_t)gb4) ; // JAC
}
/*
* This routine maps UCS characters to 4-byte GB18030 characters
*/
WChar_t // JAC
__UCS_to_gb18030(ucs4_t ucs)
{
ucs_gb4idx_t key, *result ;
int gb4idx ;
int gb4 ;
if (ucs < 0x10000)
{
key.hi_ucs = key.lo_ucs = ucs ;
key.offset = INT_MAX ;
if ((result = (ucs_gb4idx_t *)bsearch(&key, ucs_gb4idx_table, ARRSIZE(ucs_gb4idx_table), //JAC
sizeof(ucs_gb4idx_table[0]),
compare_ucs_gb4idx_C)) == NULL) //JAC
return(BAD) ;
gb4idx = result->offset + ucs;
}
else if (ucs < 0x110000)
gb4idx = UCS_TO_4IDXE(ucs) ; /* Map plane 1-16 UCS character */
else
return(BAD) ;
GBIDX_GB18030(gb4idx, gb4) ;
return((WChar_t)gb4) ; // JAC
}