blob: 8ecba4e337eabd9480abe6889a70c3dd5ff4098b [file] [log] [blame]
/**********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
**********************************************************************/
/* -*-C++-*-
*****************************************************************************
*
* File: unicode_char_set.cpp
* RCS: $Id:
* Description: The implementation of unicode_char_set class
*
*
* Created: 7/8/98
* Modified: $ $Date: 1998/08/10 16:01:12 $ (GMT)
* Language: C++
* Status: $State: Exp $
*
*
*
*
*****************************************************************************
*/
#include "unicode_char_set.h"
#include "BaseTypes.h"
#include "nawstring.h"
// 4/10/98 Unicode char/string manipulations
typedef struct unicode_mapping
{
unsigned short code1;
unsigned short code2;
} unicode_mapping_t;
typedef struct unicode_mapping_full
{
unsigned short code1;
unsigned short code2[3];
} unicode_mapping_full_t;
// the following three included .h files are generated using
// the script MiscVOB/i18n/CaseMapping.pl
static const unicode_mapping_full_t unicode_lower2upper_mapping_table_full[] =
{
#include "1n_lt2u.h"
};
static const unicode_mapping_t unicode_lower2upper_mapping_table[] =
{
#include "11_lt2u.h"
};
static const unicode_mapping_t unicode_upper2lower_mapping_table[] =
{
#include "11_ut2l.h"
};
NAWchar
binary_search(NAWchar wc, Int32 lower, Int32 upper,
unicode_mapping_t table[])
{
while ( lower <= upper ) {
Int32 middle = (lower+upper) >> 1;
if ( table[middle].code1 == wc )
return table[middle].code2;
if ( table[middle].code1 < wc )
lower = middle + 1;
else
upper = middle - 1;
}
return wc;
}
// search the lower to upper full mapping table
NAWchar*
binary_search(NAWchar wc, Int32 lower, Int32 upper,
unicode_mapping_full_t table[])
{
while ( lower <= upper ) {
Int32 middle = (lower+upper) >> 1;
if ( table[middle].code1 == wc )
return (NAWchar *)table[middle].code2;
if ( table[middle].code1 < wc )
lower = middle + 1;
else
upper = middle - 1;
}
return NULL;
}
//
// 4/10/98 compute the Unicode upperShift function
//
NAWchar unicode_char_set::to_upper(const NAWchar x)
{
if ( IN_RANGE(x, 0x61, 0x7a) ) { // frequently used chars checked first
return x - 0x61 + 0x41;
}
if ( IN_RANGE(x, 0xe0, 0xf6) ) { // frequently used chars checked first
return x - 0xe0 + 0xc0;
}
return binary_search(x, 0,
sizeof(unicode_lower2upper_mapping_table)/sizeof(unicode_mapping_t)-1,
(unicode_mapping_t*)unicode_lower2upper_mapping_table
);
}
// full case mapping
NAWchar* unicode_char_set::to_upper_full(const NAWchar x)
{
return binary_search(x, 0,
sizeof(unicode_lower2upper_mapping_table_full)/sizeof(unicode_mapping_full_t)-1,
(unicode_mapping_full_t*)unicode_lower2upper_mapping_table_full
);
}
// This method works the same way as ex_function_upper_unicode::eval()
// in exp/exp_function_upper_unicode.cpp. This method is being called
// by ConstValue::toUpper() while applying upper method on constants.
// The 'len' number of chars in 'str' are upshifted and the result
// is kept in 'upStr'.
void
unicode_char_set::to_upper(NAWchar *str, size_t len, NAWString &upStr)
{
NAWchar* tmpWCP = NULL;
for(size_t i = 0; i < len; ++i) {
// search against unicode_lower2upper_mapping_table_full
tmpWCP = unicode_char_set::to_upper_full(str[i]);
if (tmpWCP) {
upStr += tmpWCP[0];
upStr += tmpWCP[1];
if (tmpWCP[2] != (NAWchar)0) {
upStr += tmpWCP[2];
}
} else {
// a NULL return from to_upper_full()
// search against unicode_lower2upper_mapping_table then
upStr += unicode_char_set::to_upper(str[i]);
}
}
}
/*
long unicode_char_set::to_upper(NAWchar *str, long srcLen,
NAWchar *upStr, long maxTgtLen)
{
NAWchar *tmpWCP = NULL;
long tgtLen = 0, i = 0;
for(i = 0; i < srcLen; ++i)
{
// search against unicode_lower2upper_mapping_table_full
tmpWCP = unicode_char_set::to_upper_full(str[i]);
if (tmpWCP)
{
if(tgtLen >= maxTgtLen - 1) return -1;
upStr[tgtLen++] = tmpWCP[0];
upStr[tgtLen++] = tmpWCP[1];
if (tmpWCP[2] != (NAWchar)0)
{
if(tgtLen >= maxTgtLen) return -1;
upStr[tgtLen++] = tmpWCP[2];
}
} else {
// a NULL return from to_upper_full()
// search against unicode_lower2upper_mapping_table then
if(tgtLen >= maxTgtLen) return -1;
upStr[tgtLen++] = unicode_char_set::to_upper(str[i]);
}
}
return tgtLen;
}
*/
// 4/10/98 compute the Unicode toLower function
NAWchar unicode_char_set::to_lower(const NAWchar x)
{
if ( IN_RANGE(x, 0x41, 0x5a) ) { // frequently used chars checked first
return x + 0x61 - 0x41;
}
if ( IN_RANGE(x, 0xc0, 0xd6) ) { // frequently used chars checked first
return x + 0xe0 - 0xc0;
}
return binary_search(x, 0, sizeof(unicode_upper2lower_mapping_table)/sizeof(unicode_mapping_t)-1,
(unicode_mapping_t*)unicode_upper2lower_mapping_table
);
}
//
// UTF-8 related functions
//
Int32 IndexOfLastByteOfUTF8CharAtOrBeforePos (const unsigned char *utf8Str,
const Int32 utf8StrLenInBytes,
const Int32 bytePos)
{
if (utf8Str == NULL || utf8StrLenInBytes <= 0 || bytePos < 0 || bytePos >= utf8StrLenInBytes)
return -1; // error
if (IS_7_BIT_ASCII_IN_UTF8_CHAR(utf8Str[bytePos]))
return bytePos;
Int32 indexOf1stByteOfUtf8Char = IndexOfFirstByteOfUTF8CharAtOrBeforePos(utf8Str, utf8StrLenInBytes, bytePos);
if (indexOf1stByteOfUtf8Char < 0) // could not find the first byte in a UTF-8 character in the string
return -1; // cannot tell if this is the last byte
Int32 byteCountForUtf8Char = UTF8CharLenInBytes(utf8Str[indexOf1stByteOfUtf8Char]);
if (byteCountForUtf8Char <= 0) // error
return -1; // cannot tell if this is the last byte
Int32 indexOfLastByteOfUtf8Char = indexOf1stByteOfUtf8Char + byteCountForUtf8Char - 1;
if (indexOfLastByteOfUtf8Char == bytePos)
return bytePos;
if (indexOf1stByteOfUtf8Char > 0)
return IndexOfLastByteOfUTF8CharAtOrBeforePos(utf8Str, utf8StrLenInBytes,
indexOf1stByteOfUtf8Char - 1 /*bytePos*/);
return -1; // error
}
Int32 UTF8CharLenInBytes(const unsigned char firstByteOfTheUtf8Char)
{
if (IS_NOT_1ST_BYTE_IN_UTF8_CHAR(firstByteOfTheUtf8Char))
return 0; // error
if (IS_7_BIT_ASCII_IN_UTF8_CHAR(firstByteOfTheUtf8Char))
return 1;
if (((firstByteOfTheUtf8Char & 0xE0) >> 5) == 0x06)
return 2;
if (((firstByteOfTheUtf8Char & 0xF0) >> 4) == 0x0E)
return 3;
if (((firstByteOfTheUtf8Char & 0xF8) >> 3) == 0x1E)
return 4;
//if (((firstByteOfTheUtf8Char & 0xFC) >> 2) == 0x3E)
// return 5;
//if (((firstByteOfTheUtf8Char & 0xFE) >> 1) == 0x7E)
// return 6;
//if (firstByteOfTheUtf8Char == 0xFE)
// return 7;
//if (firstByteOfTheUtf8Char == 0xFF)
// return 8;
return 0; // error
}
Int32 IndexOfFirstByteOfUTF8CharAtOrBeforePos(const unsigned char *utf8Str,
const Int32 utf8StrLenInBytes,
const Int32 bytePos)
{
if (utf8Str == NULL || utf8StrLenInBytes <= 0 || bytePos < 0 || bytePos >= utf8StrLenInBytes)
return -1; // error
if (IS_7_BIT_ASCII_IN_UTF8_CHAR(utf8Str[bytePos]))
return bytePos;
Int32 i = bytePos;
while (i >= 0 && IS_NOT_1ST_BYTE_IN_UTF8_CHAR(utf8Str[i]))
i--;
if (i >= 0 && IS_1ST_BYTE_IN_UTF8_CHAR(utf8Str[i]))
return i;
return -1; // error
}