core/sql/common/unicode_char_set.cpp - trafodion - Git at Google

 /**********************************************************************
 // @@@ START COPYRIGHT @@@
 //
 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
 //
 // @@@ END COPYRIGHT @@@
 **********************************************************************/

 /* -*-C++-*-
  *****************************************************************************
  *
  * File:         unicode_char_set.cpp
  * RCS:          $Id:
  * Description:  The implementation of unicode_char_set class
  *
  *
  * Created:      7/8/98
  * Modified:     $ $Date: 1998/08/10 16:01:12 $ (GMT)
  * Language:     C++
  * Status:       $State: Exp $
  *
  *
  *
  *
  *****************************************************************************
  */

 #include "unicode_char_set.h"
 #include "BaseTypes.h"

 #include "nawstring.h"

 // 4/10/98  Unicode char/string manipulations

 // One row of a 1-to-1 case-mapping table.
 //   code1 - the code unit being looked up (the search key)
 //   code2 - the single code unit that code1 maps to
 struct unicode_mapping
 {
   unsigned short code1;
   unsigned short code2;
 };
 typedef struct unicode_mapping unicode_mapping_t;

 // One row of a 1-to-many ("full") case-mapping table.
 //   code1 - the code unit being looked up (the search key)
 //   code2 - up to three code units that code1 maps to; users of the table
 //           in this file treat a zero in code2[2] as "no third unit"
 struct unicode_mapping_full
 {
   unsigned short code1;
   unsigned short code2[3];
 };
 typedef struct unicode_mapping_full unicode_mapping_full_t;

 // the following three included .h files are generated using
 // the script MiscVOB/i18n/CaseMapping.pl
 //
 // Lower-to-upper "full" mapping table: each row maps one code unit (code1)
 // to as many as three code units (code2[3]). The rows come from the included
 // file 1n_lt2u.h and are looked up by unicode_char_set::to_upper_full().
 // binary_search() narrows its range by comparing code1 values, so the rows
 // need to be in ascending code1 order (presumably guaranteed by the
 // generator script -- confirm).
 static const unicode_mapping_full_t unicode_lower2upper_mapping_table_full[] =
 {
 #include "1n_lt2u.h"
 };

 // Lower-to-upper 1-to-1 mapping table; rows come from the included file
 // 11_lt2u.h. Looked up by unicode_char_set::to_upper(const NAWchar) for
 // values not handled by its fast-path range checks. binary_search()
 // compares code1 values to halve its range, so the rows need to be in
 // ascending code1 order (presumably guaranteed by the generator -- confirm).
 static const unicode_mapping_t unicode_lower2upper_mapping_table[] =
 {
 #include "11_lt2u.h"
 };


 // Upper-to-lower 1-to-1 mapping table; rows come from the included file
 // 11_ut2l.h. Looked up by unicode_char_set::to_lower() for values not
 // handled by its fast-path range checks. binary_search() compares code1
 // values to halve its range, so the rows need to be in ascending code1
 // order (presumably guaranteed by the generator -- confirm).
 static const unicode_mapping_t unicode_upper2lower_mapping_table[] =
 {
 #include "11_ut2l.h"
 };


 // Look up 'wc' in a 1-to-1 mapping table by binary search.
 //   wc    - the code unit to look for (compared against each row's code1)
 //   lower - index of the first row to consider
 //   upper - index of the last row to consider (inclusive)
 //   table - the mapping rows; the halving search only works when they are
 //           ordered by ascending code1
 // Returns the matching row's code2, or 'wc' itself when no row matches.
 NAWchar
 binary_search(NAWchar wc, Int32 lower, Int32 upper,
               unicode_mapping_t table[])
 {
    Int32 lo = lower;
    Int32 hi = upper;

    for ( ; lo <= hi; ) {
       const Int32 mid = (lo+hi) >> 1;
       const unsigned short key = table[mid].code1;

       if ( key < wc ) {
          lo = mid + 1;      // target, if present, is in the upper half
       } else if ( key == wc ) {
          return table[mid].code2;
       } else {
          hi = mid - 1;      // target, if present, is in the lower half
       }
    }

    // No row for this code unit: it maps to itself.
    return wc;
 }

 // Look up 'wc' in the lower-to-upper full (1-to-many) mapping table by
 // binary search.
 //   wc    - the code unit to look for (compared against each row's code1)
 //   lower - index of the first row to consider
 //   upper - index of the last row to consider (inclusive)
 //   table - the mapping rows; the halving search only works when they are
 //           ordered by ascending code1
 // Returns a pointer to the matching row's three-element code2 array, or
 // NULL when no row matches.
 NAWchar*
 binary_search(NAWchar wc, Int32 lower, Int32 upper,
               unicode_mapping_full_t table[])
 {
    Int32 lo = lower;
    Int32 hi = upper;

    for ( ; lo <= hi; ) {
       const Int32 mid = (lo+hi) >> 1;
       const unsigned short key = table[mid].code1;

       if ( key < wc ) {
          lo = mid + 1;      // target, if present, is in the upper half
       } else if ( key == wc ) {
          return (NAWchar *)table[mid].code2;
       } else {
          hi = mid - 1;      // target, if present, is in the lower half
       }
    }

    return NULL;
 }

 //
 // Upper-case a single code unit using the simple (1-to-1) mapping.
 // Returns the mapped value, or x unchanged when neither the fast-path
 // ranges nor the lower-to-upper table contain it.
 //
 NAWchar unicode_char_set::to_upper(const NAWchar x)
 {
    // Fast paths: two frequently used ranges that sit at a constant offset
    // from their upper-case forms, so no table lookup is needed.
    // (Range bounds are interpreted by IN_RANGE; presumably inclusive.)
    if ( IN_RANGE(x, 0x61, 0x7a) )
       return x - 0x61 + 0x41;

    if ( IN_RANGE(x, 0xe0, 0xf6) )
       return x - 0xe0 + 0xc0;

    // Everything else goes through the generated 1-to-1 table. The table is
    // declared const; binary_search() takes a non-const pointer, hence the cast.
    unicode_mapping_t* const rows =
        (unicode_mapping_t*)unicode_lower2upper_mapping_table;
    const Int32 lastRow =
        sizeof(unicode_lower2upper_mapping_table)/sizeof(unicode_mapping_t)-1;

    return binary_search(x, 0, lastRow, rows);
 }

 // Full (1-to-many) upper-casing lookup for a single code unit.
 // Returns a pointer to the three-element code2 array of the matching row in
 // unicode_lower2upper_mapping_table_full, or NULL when x has no row there.
 // The table is declared const and the cast below discards that, so the
 // returned array must be treated as read-only.
 NAWchar* unicode_char_set::to_upper_full(const NAWchar x)
 {
    unicode_mapping_full_t* const rows =
        (unicode_mapping_full_t*)unicode_lower2upper_mapping_table_full;
    const Int32 lastRow =
        sizeof(unicode_lower2upper_mapping_table_full)/sizeof(unicode_mapping_full_t)-1;

    return binary_search(x, 0, lastRow, rows);
 }

 // Upper-case the first 'len' code units of 'str', appending the result to
 // 'upStr' (existing contents of 'upStr' are kept; output is only appended).
 //
 // Each code unit is first looked up in the full (1-to-many) table; a hit
 // contributes two or three code units. Otherwise the simple 1-to-1
 // to_upper() result is appended.
 //
 // Per the original note: this works the same way as
 // ex_function_upper_unicode::eval() in exp/exp_function_upper_unicode.cpp
 // and is called by ConstValue::toUpper() when upshifting constants.
 void
 unicode_char_set::to_upper(NAWchar *str, size_t len, NAWString &upStr)
 {
   for (size_t pos = 0; pos < len; pos++) {
     const NAWchar ch = str[pos];

     // Try the full mapping table first.
     NAWchar* const expansion = unicode_char_set::to_upper_full(ch);

     if (expansion == NULL) {
       // No full mapping: fall back to the 1-to-1 mapping.
       upStr += unicode_char_set::to_upper(ch);
       continue;
     }

     // A full mapping always supplies the first two units; the third is
     // present only when it is non-zero.
     upStr += expansion[0];
     upStr += expansion[1];
     if (expansion[2] != (NAWchar)0)
       upStr += expansion[2];
   }
 }

 /*
 long unicode_char_set::to_upper(NAWchar *str, long srcLen,
                                 NAWchar *upStr, long maxTgtLen)
 {
   NAWchar *tmpWCP = NULL;
   long tgtLen = 0, i = 0;

   for(i = 0; i < srcLen; ++i)
   {
     // search against unicode_lower2upper_mapping_table_full
     tmpWCP = unicode_char_set::to_upper_full(str[i]);

     if (tmpWCP)
     {
       if(tgtLen >= maxTgtLen - 1) return -1;

       upStr[tgtLen++] = tmpWCP[0];
       upStr[tgtLen++] = tmpWCP[1];

       if (tmpWCP[2] != (NAWchar)0)
       {
         if(tgtLen >= maxTgtLen) return -1;
         upStr[tgtLen++] = tmpWCP[2];
       }
     } else {
       // a NULL return from to_upper_full()
       // search against unicode_lower2upper_mapping_table then
       if(tgtLen >= maxTgtLen) return -1;
       upStr[tgtLen++] = unicode_char_set::to_upper(str[i]);
     }
   }
   return tgtLen;
 }
 */

 // Lower-case a single code unit using the simple (1-to-1) mapping.
 // Returns the mapped value, or x unchanged when neither the fast-path
 // ranges nor the upper-to-lower table contain it.
 NAWchar unicode_char_set::to_lower(const NAWchar x)
 {
    // Fast paths: two frequently used ranges that sit at a constant offset
    // from their lower-case forms, so no table lookup is needed.
    // (Range bounds are interpreted by IN_RANGE; presumably inclusive.)
    if ( IN_RANGE(x, 0x41, 0x5a) )
       return  x + 0x61 - 0x41;

    if ( IN_RANGE(x, 0xc0, 0xd6) )
       return x + 0xe0 - 0xc0;

    // Everything else goes through the generated 1-to-1 table. The table is
    // declared const; binary_search() takes a non-const pointer, hence the cast.
    unicode_mapping_t* const rows =
        (unicode_mapping_t*)unicode_upper2lower_mapping_table;
    const Int32 lastRow =
        sizeof(unicode_upper2lower_mapping_table)/sizeof(unicode_mapping_t)-1;

    return binary_search(x, 0, lastRow, rows);
 }

 //
 // UTF-8 related functions
 //

 // Find the index of a byte, at or before 'bytePos', that is the last byte
 // of a UTF-8 character in 'utf8Str'.
 //   utf8Str            - the bytes to examine
 //   utf8StrLenInBytes  - number of bytes in utf8Str
 //   bytePos            - index at which to start looking
 // Returns the index found, or -1 when the arguments are invalid, when no
 // first byte can be located, when the first byte's length cannot be
 // determined, or when the search runs off the start of the string.
 //
 // If bytePos falls in the middle of a character (it is not that
 // character's computed last byte), the search restarts at the byte just
 // before that character's first byte.
 Int32 IndexOfLastByteOfUTF8CharAtOrBeforePos (const unsigned char *utf8Str,
                                               const Int32 utf8StrLenInBytes,
                                               const Int32 bytePos)
 {
   if (utf8Str == NULL || utf8StrLenInBytes <= 0)
     return -1; // error

   Int32 pos = bytePos;
   for (;;) {
     if (pos < 0 || pos >= utf8StrLenInBytes)
       return -1; // error

     // A 7-bit ASCII byte is a complete character on its own.
     if (IS_7_BIT_ASCII_IN_UTF8_CHAR(utf8Str[pos]))
       return pos;

     const Int32 firstIdx =
       IndexOfFirstByteOfUTF8CharAtOrBeforePos(utf8Str, utf8StrLenInBytes, pos);
     if (firstIdx < 0)
       return -1; // no first byte found, so cannot tell if pos is a last byte

     const Int32 charLen = UTF8CharLenInBytes(utf8Str[firstIdx]);
     if (charLen <= 0)
       return -1; // length unknown, so cannot tell if pos is a last byte

     if (firstIdx + charLen - 1 == pos)
       return pos; // pos is exactly the last byte of this character

     if (firstIdx <= 0)
       return -1; // nothing precedes this character

     // Retry from the byte immediately before this character.
     pos = firstIdx - 1;
   }
 }

 // Determine how many bytes a UTF-8 character occupies from its first byte.
 // Returns 1 to 4 for a recognized first byte and 0 on error, i.e. when the
 // byte is rejected by IS_NOT_1ST_BYTE_IN_UTF8_CHAR or matches none of the
 // patterns below. Lead-byte patterns for sequences longer than 4 bytes are
 // deliberately not recognized (the original code had those checks
 // commented out) and therefore also yield 0.
 Int32 UTF8CharLenInBytes(const unsigned char firstByteOfTheUtf8Char)
 {
   if (IS_NOT_1ST_BYTE_IN_UTF8_CHAR(firstByteOfTheUtf8Char))
     return 0; // error

   if (IS_7_BIT_ASCII_IN_UTF8_CHAR(firstByteOfTheUtf8Char))
     return 1;

   if ((firstByteOfTheUtf8Char & 0xE0) == 0xC0)   // 110x xxxx
     return 2;

   if ((firstByteOfTheUtf8Char & 0xF0) == 0xE0)   // 1110 xxxx
     return 3;

   if ((firstByteOfTheUtf8Char & 0xF8) == 0xF0)   // 1111 0xxx
     return 4;

   return 0; // error
 }

 // Find the index of the first byte of the UTF-8 character that contains
 // (or starts at) 'bytePos', scanning backwards from bytePos.
 //   utf8Str            - the bytes to examine
 //   utf8StrLenInBytes  - number of bytes in utf8Str
 //   bytePos            - index at which to start looking
 // Returns the index found, or -1 when the arguments are invalid or no
 // first byte is found at or before bytePos.
 Int32 IndexOfFirstByteOfUTF8CharAtOrBeforePos(const unsigned char *utf8Str,
                                               const Int32 utf8StrLenInBytes,
                                               const Int32 bytePos)
 {
   const bool badString = (utf8Str == NULL || utf8StrLenInBytes <= 0);
   if (badString || bytePos < 0 || bytePos >= utf8StrLenInBytes)
     return -1; // error

   // A 7-bit ASCII byte is its own first byte.
   if (IS_7_BIT_ASCII_IN_UTF8_CHAR(utf8Str[bytePos]))
     return bytePos;

   // Walk backwards over bytes that cannot start a character.
   for (Int32 idx = bytePos; idx >= 0; --idx) {
     if (IS_NOT_1ST_BYTE_IN_UTF8_CHAR(utf8Str[idx]))
       continue;
     // Stopped on the first byte that is not flagged as a non-first byte;
     // accept it only if it is positively identified as a first byte.
     if (IS_1ST_BYTE_IN_UTF8_CHAR(utf8Str[idx]))
       return idx;
     return -1; // error
   }
   return -1; // error
 }
	/**********************************************************************
	// @@@ START COPYRIGHT @@@
	//
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.
	//
	// @@@ END COPYRIGHT @@@
	**********************************************************************/

	/* -*-C++-*-
	*****************************************************************************
	*
	* File: unicode_char_set.cpp
	* RCS: $Id:
	* Description: The implementation of unicode_char_set class
	*
	*
	* Created: 7/8/98
	* Modified: $ $Date: 1998/08/10 16:01:12 $ (GMT)
	* Language: C++
	* Status: $State: Exp $
	*
	*
	*
	*
	*****************************************************************************
	*/

	#include "unicode_char_set.h"
	#include "BaseTypes.h"

	#include "nawstring.h"

	// 4/10/98 Unicode char/string manipulations

	// One row of a 1-to-1 case-mapping table.
	//   code1 - the code unit being looked up (the search key)
	//   code2 - the single code unit that code1 maps to
	struct unicode_mapping
	{
	  unsigned short code1;
	  unsigned short code2;
	};
	typedef struct unicode_mapping unicode_mapping_t;

	// One row of a 1-to-many ("full") case-mapping table.
	//   code1 - the code unit being looked up (the search key)
	//   code2 - up to three code units that code1 maps to; users of the table
	//           in this file treat a zero in code2[2] as "no third unit"
	struct unicode_mapping_full
	{
	  unsigned short code1;
	  unsigned short code2[3];
	};
	typedef struct unicode_mapping_full unicode_mapping_full_t;

	// the following three included .h files are generated using
	// the script MiscVOB/i18n/CaseMapping.pl
	//
	// Lower-to-upper "full" mapping table: each row maps one code unit (code1)
	// to as many as three code units (code2[3]). The rows come from the included
	// file 1n_lt2u.h and are looked up by unicode_char_set::to_upper_full().
	// binary_search() narrows its range by comparing code1 values, so the rows
	// need to be in ascending code1 order (presumably guaranteed by the
	// generator script -- confirm).
	static const unicode_mapping_full_t unicode_lower2upper_mapping_table_full[] =
	{
	#include "1n_lt2u.h"
	};

	// Lower-to-upper 1-to-1 mapping table; rows come from the included file
	// 11_lt2u.h. Looked up by unicode_char_set::to_upper(const NAWchar) for
	// values not handled by its fast-path range checks. binary_search()
	// compares code1 values to halve its range, so the rows need to be in
	// ascending code1 order (presumably guaranteed by the generator -- confirm).
	static const unicode_mapping_t unicode_lower2upper_mapping_table[] =
	{
	#include "11_lt2u.h"
	};


	// Upper-to-lower 1-to-1 mapping table; rows come from the included file
	// 11_ut2l.h. Looked up by unicode_char_set::to_lower() for values not
	// handled by its fast-path range checks. binary_search() compares code1
	// values to halve its range, so the rows need to be in ascending code1
	// order (presumably guaranteed by the generator -- confirm).
	static const unicode_mapping_t unicode_upper2lower_mapping_table[] =
	{
	#include "11_ut2l.h"
	};


	// Look up 'wc' in a 1-to-1 mapping table by binary search.
	//   wc    - the code unit to look for (compared against each row's code1)
	//   lower - index of the first row to consider
	//   upper - index of the last row to consider (inclusive)
	//   table - the mapping rows; the halving search only works when they are
	//           ordered by ascending code1
	// Returns the matching row's code2, or 'wc' itself when no row matches.
	NAWchar
	binary_search(NAWchar wc, Int32 lower, Int32 upper,
	              unicode_mapping_t table[])
	{
	   Int32 lo = lower;
	   Int32 hi = upper;

	   for ( ; lo <= hi; ) {
	      const Int32 mid = (lo+hi) >> 1;
	      const unsigned short key = table[mid].code1;

	      if ( key < wc ) {
	         lo = mid + 1;      // target, if present, is in the upper half
	      } else if ( key == wc ) {
	         return table[mid].code2;
	      } else {
	         hi = mid - 1;      // target, if present, is in the lower half
	      }
	   }

	   // No row for this code unit: it maps to itself.
	   return wc;
	}

	// Look up 'wc' in the lower-to-upper full (1-to-many) mapping table by
	// binary search.
	//   wc    - the code unit to look for (compared against each row's code1)
	//   lower - index of the first row to consider
	//   upper - index of the last row to consider (inclusive)
	//   table - the mapping rows; the halving search only works when they are
	//           ordered by ascending code1
	// Returns a pointer to the matching row's three-element code2 array, or
	// NULL when no row matches.
	NAWchar*
	binary_search(NAWchar wc, Int32 lower, Int32 upper,
	              unicode_mapping_full_t table[])
	{
	   Int32 lo = lower;
	   Int32 hi = upper;

	   for ( ; lo <= hi; ) {
	      const Int32 mid = (lo+hi) >> 1;
	      const unsigned short key = table[mid].code1;

	      if ( key < wc ) {
	         lo = mid + 1;      // target, if present, is in the upper half
	      } else if ( key == wc ) {
	         return (NAWchar *)table[mid].code2;
	      } else {
	         hi = mid - 1;      // target, if present, is in the lower half
	      }
	   }

	   return NULL;
	}

	//
	// Upper-case a single code unit using the simple (1-to-1) mapping.
	// Returns the mapped value, or x unchanged when neither the fast-path
	// ranges nor the lower-to-upper table contain it.
	//
	NAWchar unicode_char_set::to_upper(const NAWchar x)
	{
	   // Fast paths: two frequently used ranges that sit at a constant offset
	   // from their upper-case forms, so no table lookup is needed.
	   // (Range bounds are interpreted by IN_RANGE; presumably inclusive.)
	   if ( IN_RANGE(x, 0x61, 0x7a) )
	      return x - 0x61 + 0x41;

	   if ( IN_RANGE(x, 0xe0, 0xf6) )
	      return x - 0xe0 + 0xc0;

	   // Everything else goes through the generated 1-to-1 table. The table is
	   // declared const; binary_search() takes a non-const pointer, hence the cast.
	   unicode_mapping_t* const rows =
	       (unicode_mapping_t*)unicode_lower2upper_mapping_table;
	   const Int32 lastRow =
	       sizeof(unicode_lower2upper_mapping_table)/sizeof(unicode_mapping_t)-1;

	   return binary_search(x, 0, lastRow, rows);
	}

	// Full (1-to-many) upper-casing lookup for a single code unit.
	// Returns a pointer to the three-element code2 array of the matching row in
	// unicode_lower2upper_mapping_table_full, or NULL when x has no row there.
	// The table is declared const and the cast below discards that, so the
	// returned array must be treated as read-only.
	NAWchar* unicode_char_set::to_upper_full(const NAWchar x)
	{
	   unicode_mapping_full_t* const rows =
	       (unicode_mapping_full_t*)unicode_lower2upper_mapping_table_full;
	   const Int32 lastRow =
	       sizeof(unicode_lower2upper_mapping_table_full)/sizeof(unicode_mapping_full_t)-1;

	   return binary_search(x, 0, lastRow, rows);
	}

	// Upper-case the first 'len' code units of 'str', appending the result to
	// 'upStr' (existing contents of 'upStr' are kept; output is only appended).
	//
	// Each code unit is first looked up in the full (1-to-many) table; a hit
	// contributes two or three code units. Otherwise the simple 1-to-1
	// to_upper() result is appended.
	//
	// Per the original note: this works the same way as
	// ex_function_upper_unicode::eval() in exp/exp_function_upper_unicode.cpp
	// and is called by ConstValue::toUpper() when upshifting constants.
	void
	unicode_char_set::to_upper(NAWchar *str, size_t len, NAWString &upStr)
	{
	  for (size_t pos = 0; pos < len; pos++) {
	    const NAWchar ch = str[pos];

	    // Try the full mapping table first.
	    NAWchar* const expansion = unicode_char_set::to_upper_full(ch);

	    if (expansion == NULL) {
	      // No full mapping: fall back to the 1-to-1 mapping.
	      upStr += unicode_char_set::to_upper(ch);
	      continue;
	    }

	    // A full mapping always supplies the first two units; the third is
	    // present only when it is non-zero.
	    upStr += expansion[0];
	    upStr += expansion[1];
	    if (expansion[2] != (NAWchar)0)
	      upStr += expansion[2];
	  }
	}

	/*
	long unicode_char_set::to_upper(NAWchar *str, long srcLen,
	NAWchar *upStr, long maxTgtLen)
	{
	NAWchar *tmpWCP = NULL;
	long tgtLen = 0, i = 0;

	for(i = 0; i < srcLen; ++i)
	{
	// search against unicode_lower2upper_mapping_table_full
	tmpWCP = unicode_char_set::to_upper_full(str[i]);

	if (tmpWCP)
	{
	if(tgtLen >= maxTgtLen - 1) return -1;

	upStr[tgtLen++] = tmpWCP[0];
	upStr[tgtLen++] = tmpWCP[1];

	if (tmpWCP[2] != (NAWchar)0)
	{
	if(tgtLen >= maxTgtLen) return -1;
	upStr[tgtLen++] = tmpWCP[2];
	}
	} else {
	// a NULL return from to_upper_full()
	// search against unicode_lower2upper_mapping_table then
	if(tgtLen >= maxTgtLen) return -1;
	upStr[tgtLen++] = unicode_char_set::to_upper(str[i]);
	}
	}
	return tgtLen;
	}
	*/

	// Lower-case a single code unit using the simple (1-to-1) mapping.
	// Returns the mapped value, or x unchanged when neither the fast-path
	// ranges nor the upper-to-lower table contain it.
	NAWchar unicode_char_set::to_lower(const NAWchar x)
	{
	   // Fast paths: two frequently used ranges that sit at a constant offset
	   // from their lower-case forms, so no table lookup is needed.
	   // (Range bounds are interpreted by IN_RANGE; presumably inclusive.)
	   if ( IN_RANGE(x, 0x41, 0x5a) )
	      return x + 0x61 - 0x41;

	   if ( IN_RANGE(x, 0xc0, 0xd6) )
	      return x + 0xe0 - 0xc0;

	   // Everything else goes through the generated 1-to-1 table. The table is
	   // declared const; binary_search() takes a non-const pointer, hence the cast.
	   unicode_mapping_t* const rows =
	       (unicode_mapping_t*)unicode_upper2lower_mapping_table;
	   const Int32 lastRow =
	       sizeof(unicode_upper2lower_mapping_table)/sizeof(unicode_mapping_t)-1;

	   return binary_search(x, 0, lastRow, rows);
	}

	//
	// UTF-8 related functions
	//

	// Find the index of a byte, at or before 'bytePos', that is the last byte
	// of a UTF-8 character in 'utf8Str'.
	//   utf8Str            - the bytes to examine
	//   utf8StrLenInBytes  - number of bytes in utf8Str
	//   bytePos            - index at which to start looking
	// Returns the index found, or -1 when the arguments are invalid, when no
	// first byte can be located, when the first byte's length cannot be
	// determined, or when the search runs off the start of the string.
	//
	// If bytePos falls in the middle of a character (it is not that
	// character's computed last byte), the search restarts at the byte just
	// before that character's first byte.
	Int32 IndexOfLastByteOfUTF8CharAtOrBeforePos (const unsigned char *utf8Str,
	                                              const Int32 utf8StrLenInBytes,
	                                              const Int32 bytePos)
	{
	  if (utf8Str == NULL || utf8StrLenInBytes <= 0)
	    return -1; // error

	  Int32 pos = bytePos;
	  for (;;) {
	    if (pos < 0 || pos >= utf8StrLenInBytes)
	      return -1; // error

	    // A 7-bit ASCII byte is a complete character on its own.
	    if (IS_7_BIT_ASCII_IN_UTF8_CHAR(utf8Str[pos]))
	      return pos;

	    const Int32 firstIdx =
	      IndexOfFirstByteOfUTF8CharAtOrBeforePos(utf8Str, utf8StrLenInBytes, pos);
	    if (firstIdx < 0)
	      return -1; // no first byte found, so cannot tell if pos is a last byte

	    const Int32 charLen = UTF8CharLenInBytes(utf8Str[firstIdx]);
	    if (charLen <= 0)
	      return -1; // length unknown, so cannot tell if pos is a last byte

	    if (firstIdx + charLen - 1 == pos)
	      return pos; // pos is exactly the last byte of this character

	    if (firstIdx <= 0)
	      return -1; // nothing precedes this character

	    // Retry from the byte immediately before this character.
	    pos = firstIdx - 1;
	  }
	}

	// Determine how many bytes a UTF-8 character occupies from its first byte.
	// Returns 1 to 4 for a recognized first byte and 0 on error, i.e. when the
	// byte is rejected by IS_NOT_1ST_BYTE_IN_UTF8_CHAR or matches none of the
	// patterns below. Lead-byte patterns for sequences longer than 4 bytes are
	// deliberately not recognized (the original code had those checks
	// commented out) and therefore also yield 0.
	Int32 UTF8CharLenInBytes(const unsigned char firstByteOfTheUtf8Char)
	{
	  if (IS_NOT_1ST_BYTE_IN_UTF8_CHAR(firstByteOfTheUtf8Char))
	    return 0; // error

	  if (IS_7_BIT_ASCII_IN_UTF8_CHAR(firstByteOfTheUtf8Char))
	    return 1;

	  if ((firstByteOfTheUtf8Char & 0xE0) == 0xC0)   // 110x xxxx
	    return 2;

	  if ((firstByteOfTheUtf8Char & 0xF0) == 0xE0)   // 1110 xxxx
	    return 3;

	  if ((firstByteOfTheUtf8Char & 0xF8) == 0xF0)   // 1111 0xxx
	    return 4;

	  return 0; // error
	}

	// Find the index of the first byte of the UTF-8 character that contains
	// (or starts at) 'bytePos', scanning backwards from bytePos.
	//   utf8Str            - the bytes to examine
	//   utf8StrLenInBytes  - number of bytes in utf8Str
	//   bytePos            - index at which to start looking
	// Returns the index found, or -1 when the arguments are invalid or no
	// first byte is found at or before bytePos.
	Int32 IndexOfFirstByteOfUTF8CharAtOrBeforePos(const unsigned char *utf8Str,
	                                              const Int32 utf8StrLenInBytes,
	                                              const Int32 bytePos)
	{
	  if (utf8Str == NULL || utf8StrLenInBytes <= 0 || bytePos < 0 || bytePos >= utf8StrLenInBytes)
	    return -1; // error

	  // A 7-bit ASCII byte is its own first byte.
	  if (IS_7_BIT_ASCII_IN_UTF8_CHAR(utf8Str[bytePos]))
	    return bytePos;

	  // Walk backwards over bytes that cannot start a character.
	  Int32 i = bytePos;
	  while (i >= 0 && IS_NOT_1ST_BYTE_IN_UTF8_CHAR(utf8Str[i]))
	    i--;

	  // Accept the stopping point only if it is positively a first byte.
	  if (i >= 0 && IS_1ST_BYTE_IN_UTF8_CHAR(utf8Str[i]))
	    return i;
	  return -1; // error
	}