| /********************************************************************** |
| // @@@ START COPYRIGHT @@@ |
| // |
| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // @@@ END COPYRIGHT @@@ |
| **********************************************************************/ |
| |
| /* -*-C++-*- |
| ***************************************************************************** |
| * |
| * File: unicode_char_set.cpp |
| * RCS: $Id: |
| * Description: The implementation of unicode_char_set class |
| * |
| * |
| * Created: 7/8/98 |
| * Modified: $ $Date: 1998/08/10 16:01:12 $ (GMT) |
| * Language: C++ |
| * Status: $State: Exp $ |
| * |
| * |
| * |
| * |
| ***************************************************************************** |
| */ |
| |
| #include "unicode_char_set.h" |
| #include "BaseTypes.h" |
| |
| #include "nawstring.h" |
| |
| // 4/10/98 Unicode char/string manipulations |
| |
// One row of a one-to-one case-mapping table.  Lookups compare a character
// against code1 and, on a match, yield code2.
struct unicode_mapping
{
  unsigned short code1;   // character code used as the search key
  unsigned short code2;   // character code that code1 maps to
};
typedef struct unicode_mapping unicode_mapping_t;
| |
// One row of a "full" case-mapping table, where a single character maps to
// a short sequence of characters.  code1 is the search key; code2 holds the
// replacement sequence (three slots).
struct unicode_mapping_full
{
  unsigned short code1;      // character code used as the search key
  unsigned short code2[3];   // replacement character codes
};
typedef struct unicode_mapping_full unicode_mapping_full_t;
| |
| // the following three included .h files are generated using |
| // the script MiscVOB/i18n/CaseMapping.pl |
| static const unicode_mapping_full_t unicode_lower2upper_mapping_table_full[] = |
| { |
| #include "1n_lt2u.h" |
| }; |
| |
| static const unicode_mapping_t unicode_lower2upper_mapping_table[] = |
| { |
| #include "11_lt2u.h" |
| }; |
| |
| |
| static const unicode_mapping_t unicode_upper2lower_mapping_table[] = |
| { |
| #include "11_ut2l.h" |
| }; |
| |
| |
| NAWchar |
| binary_search(NAWchar wc, Int32 lower, Int32 upper, |
| unicode_mapping_t table[]) |
| { |
| while ( lower <= upper ) { |
| Int32 middle = (lower+upper) >> 1; |
| if ( table[middle].code1 == wc ) |
| return table[middle].code2; |
| |
| if ( table[middle].code1 < wc ) |
| lower = middle + 1; |
| else |
| upper = middle - 1; |
| } |
| return wc; |
| } |
| |
| // search the lower to upper full mapping table |
| NAWchar* |
| binary_search(NAWchar wc, Int32 lower, Int32 upper, |
| unicode_mapping_full_t table[]) |
| { |
| while ( lower <= upper ) { |
| Int32 middle = (lower+upper) >> 1; |
| if ( table[middle].code1 == wc ) |
| return (NAWchar *)table[middle].code2; |
| |
| if ( table[middle].code1 < wc ) |
| lower = middle + 1; |
| else |
| upper = middle - 1; |
| } |
| return NULL; |
| } |
| |
| // |
| // 4/10/98 compute the Unicode upperShift function |
| // |
| NAWchar unicode_char_set::to_upper(const NAWchar x) |
| { |
| if ( IN_RANGE(x, 0x61, 0x7a) ) { // frequently used chars checked first |
| return x - 0x61 + 0x41; |
| } |
| |
| if ( IN_RANGE(x, 0xe0, 0xf6) ) { // frequently used chars checked first |
| return x - 0xe0 + 0xc0; |
| } |
| |
| return binary_search(x, 0, |
| sizeof(unicode_lower2upper_mapping_table)/sizeof(unicode_mapping_t)-1, |
| (unicode_mapping_t*)unicode_lower2upper_mapping_table |
| ); |
| } |
| |
| // full case mapping |
| NAWchar* unicode_char_set::to_upper_full(const NAWchar x) |
| { |
| return binary_search(x, 0, |
| sizeof(unicode_lower2upper_mapping_table_full)/sizeof(unicode_mapping_full_t)-1, |
| (unicode_mapping_full_t*)unicode_lower2upper_mapping_table_full |
| ); |
| } |
| |
| // This method works the same way as ex_function_upper_unicode::eval() |
| // in exp/exp_function_upper_unicode.cpp. This method is being called |
| // by ConstValue::toUpper() while applying upper method on constants. |
| // The 'len' number of chars in 'str' are upshifted and the result |
| // is kept in 'upStr'. |
| void |
| unicode_char_set::to_upper(NAWchar *str, size_t len, NAWString &upStr) |
| { |
| NAWchar* tmpWCP = NULL; |
| |
| for(size_t i = 0; i < len; ++i) { |
| |
| // search against unicode_lower2upper_mapping_table_full |
| tmpWCP = unicode_char_set::to_upper_full(str[i]); |
| |
| if (tmpWCP) { |
| upStr += tmpWCP[0]; |
| upStr += tmpWCP[1]; |
| |
| if (tmpWCP[2] != (NAWchar)0) { |
| upStr += tmpWCP[2]; |
| } |
| } else { |
| // a NULL return from to_upper_full() |
| // search against unicode_lower2upper_mapping_table then |
| upStr += unicode_char_set::to_upper(str[i]); |
| } |
| } |
| } |
| |
| /* |
| long unicode_char_set::to_upper(NAWchar *str, long srcLen, |
| NAWchar *upStr, long maxTgtLen) |
| { |
| NAWchar *tmpWCP = NULL; |
| long tgtLen = 0, i = 0; |
| |
| for(i = 0; i < srcLen; ++i) |
| { |
| // search against unicode_lower2upper_mapping_table_full |
| tmpWCP = unicode_char_set::to_upper_full(str[i]); |
| |
| if (tmpWCP) |
| { |
| if(tgtLen >= maxTgtLen - 1) return -1; |
| |
| upStr[tgtLen++] = tmpWCP[0]; |
| upStr[tgtLen++] = tmpWCP[1]; |
| |
| if (tmpWCP[2] != (NAWchar)0) |
| { |
| if(tgtLen >= maxTgtLen) return -1; |
| upStr[tgtLen++] = tmpWCP[2]; |
| } |
| } else { |
| // a NULL return from to_upper_full() |
| // search against unicode_lower2upper_mapping_table then |
| if(tgtLen >= maxTgtLen) return -1; |
| upStr[tgtLen++] = unicode_char_set::to_upper(str[i]); |
| } |
| } |
| return tgtLen; |
| } |
| */ |
| |
| // 4/10/98 compute the Unicode toLower function |
| NAWchar unicode_char_set::to_lower(const NAWchar x) |
| { |
| if ( IN_RANGE(x, 0x41, 0x5a) ) { // frequently used chars checked first |
| return x + 0x61 - 0x41; |
| } |
| |
| if ( IN_RANGE(x, 0xc0, 0xd6) ) { // frequently used chars checked first |
| return x + 0xe0 - 0xc0; |
| } |
| |
| return binary_search(x, 0, sizeof(unicode_upper2lower_mapping_table)/sizeof(unicode_mapping_t)-1, |
| (unicode_mapping_t*)unicode_upper2lower_mapping_table |
| ); |
| } |
| |
| // |
| // UTF-8 related functions |
| // |
| |
| Int32 IndexOfLastByteOfUTF8CharAtOrBeforePos (const unsigned char *utf8Str, |
| const Int32 utf8StrLenInBytes, |
| const Int32 bytePos) |
| { |
| if (utf8Str == NULL || utf8StrLenInBytes <= 0 || bytePos < 0 || bytePos >= utf8StrLenInBytes) |
| return -1; // error |
| if (IS_7_BIT_ASCII_IN_UTF8_CHAR(utf8Str[bytePos])) |
| return bytePos; |
| Int32 indexOf1stByteOfUtf8Char = IndexOfFirstByteOfUTF8CharAtOrBeforePos(utf8Str, utf8StrLenInBytes, bytePos); |
| if (indexOf1stByteOfUtf8Char < 0) // could not find the first byte in a UTF-8 character in the string |
| return -1; // cannot tell if this is the last byte |
| Int32 byteCountForUtf8Char = UTF8CharLenInBytes(utf8Str[indexOf1stByteOfUtf8Char]); |
| if (byteCountForUtf8Char <= 0) // error |
| return -1; // cannot tell if this is the last byte |
| Int32 indexOfLastByteOfUtf8Char = indexOf1stByteOfUtf8Char + byteCountForUtf8Char - 1; |
| if (indexOfLastByteOfUtf8Char == bytePos) |
| return bytePos; |
| if (indexOf1stByteOfUtf8Char > 0) |
| return IndexOfLastByteOfUTF8CharAtOrBeforePos(utf8Str, utf8StrLenInBytes, |
| indexOf1stByteOfUtf8Char - 1 /*bytePos*/); |
| return -1; // error |
| } |
| |
| Int32 UTF8CharLenInBytes(const unsigned char firstByteOfTheUtf8Char) |
| { |
| if (IS_NOT_1ST_BYTE_IN_UTF8_CHAR(firstByteOfTheUtf8Char)) |
| return 0; // error |
| if (IS_7_BIT_ASCII_IN_UTF8_CHAR(firstByteOfTheUtf8Char)) |
| return 1; |
| if (((firstByteOfTheUtf8Char & 0xE0) >> 5) == 0x06) |
| return 2; |
| if (((firstByteOfTheUtf8Char & 0xF0) >> 4) == 0x0E) |
| return 3; |
| if (((firstByteOfTheUtf8Char & 0xF8) >> 3) == 0x1E) |
| return 4; |
| //if (((firstByteOfTheUtf8Char & 0xFC) >> 2) == 0x3E) |
| // return 5; |
| //if (((firstByteOfTheUtf8Char & 0xFE) >> 1) == 0x7E) |
| // return 6; |
| //if (firstByteOfTheUtf8Char == 0xFE) |
| // return 7; |
| //if (firstByteOfTheUtf8Char == 0xFF) |
| // return 8; |
| return 0; // error |
| } |
| |
| Int32 IndexOfFirstByteOfUTF8CharAtOrBeforePos(const unsigned char *utf8Str, |
| const Int32 utf8StrLenInBytes, |
| const Int32 bytePos) |
| { |
| if (utf8Str == NULL || utf8StrLenInBytes <= 0 || bytePos < 0 || bytePos >= utf8StrLenInBytes) |
| return -1; // error |
| if (IS_7_BIT_ASCII_IN_UTF8_CHAR(utf8Str[bytePos])) |
| return bytePos; |
| Int32 i = bytePos; |
| while (i >= 0 && IS_NOT_1ST_BYTE_IN_UTF8_CHAR(utf8Str[i])) |
| i--; |
| if (i >= 0 && IS_1ST_BYTE_IN_UTF8_CHAR(utf8Str[i])) |
| return i; |
| return -1; // error |
| } |
| |