blob: 8e982927da7453147287de858307755bbc608dd0 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// @@@ END COPYRIGHT @@@
/* -*-C++-*-
* File: unicode_char_set.h
* RCS: $Id:
* Description: The implementation of unicode_char_set class
* Created: 7/8/98
* Modified: $ $Date: 2006/11/01 01:38:09 $ (GMT)
* Language: C++
* Status: $State: Exp $
// 4/10/98 A simple class devoted to the concept of Unicode
#include <string.h>
#include <stddef.h>
#include <limits.h>
#include "Platform.h"
#include "NAWinNT.h"
#include "NABoolean.h"
// Forward declaration
class NAWString;
class unicode_char_set
// Constructor; its body is empty.
unicode_char_set() {}

// Destructor, declared virtual; its body is empty as well.
virtual ~unicode_char_set() {}
// Case-mapping entry points. Only the declarations are visible in this
// header, so the notes on parameters below are assumptions to be confirmed
// against the definitions.

// Single-character form: presumably returns the upper-case counterpart of
// the character passed in.
static NAWchar to_upper(const NAWchar);
// Form that takes an NAWString by reference.
// NOTE(review): the pointer and size_t look like a source buffer and its
// length, with the NAWString receiving the result -- confirm.
static void to_upper(NAWchar *, size_t, NAWString &);
// Form that takes two NAWchar buffers, each followed by an Lng32.
// NOTE(review): looks like (source, source length, destination, destination
// length); whether the Lng32 result is a count or a status is not visible
// here -- confirm.
static Lng32 to_upper(NAWchar *, Lng32 , NAWchar *, Lng32);
// For full case mapping, a lower case character may map to
// more than one upper case character.
// NOTE(review): ownership and lifetime of the returned pointer are not
// visible here -- confirm before freeing or caching it.
static NAWchar* to_upper_full(const NAWchar);
// Single-character form: presumably returns the lower-case counterpart of
// the character passed in.
static NAWchar to_lower(const NAWchar);
// Accessors for individual characters and basic properties of NAWchar.

// Returns the class constant SPACE.
static NAWchar space_char()
{
  return unicode_char_set::SPACE;
}

// Returns the character whose value is 0.
static NAWchar null_char()
{
  return 0;
}

// Returns the class constant UNDERSCORE.
static NAWchar underscore_char()
{
  return unicode_char_set::UNDERSCORE;
}

// Returns the class constant PERCENT.
static NAWchar percent_char()
{
  return unicode_char_set::PERCENT;
}

// Returns USHRT_MAX as the largest character value.
static NAWchar maxCharValue()
{
  return USHRT_MAX;
}

// Returns the storage size of one NAWchar, in bytes.
static short bytesPerChar()
{
  return static_cast<short>(sizeof(NAWchar));
}
// Reports whether wc is accepted as a UCS-2 code point: every value below
// 0xFFFE passes, while 0xFFFE and anything above it is rejected.
// The surrogate range is not excluded by this check.
static NABoolean isValidUCS2CodePoint(NAWchar wc)
{
  return static_cast<UInt32>(wc) < 0xFFFE;
}
*** Comment out the following functions because they are not yet used. ***
// Tells whether wc is a UTF-16 high surrogate code point, i.e. whether it
// lies in the inclusive range 0xD800..0xDBFF.
static NABoolean isHighSurrogateCodePoint(NAWchar wc)
{
  const UInt32 cp = static_cast<UInt32>(wc);
  return !( cp < 0xD800 || cp > 0xDBFF );
}
// Tells whether wc is a UTF-16 low surrogate code point, i.e. whether it
// lies in the inclusive range 0xDC00..0xDFFF.
static NABoolean isLowSurrogateCodePoint(NAWchar wc)
{
  const UInt32 cp = static_cast<UInt32>(wc);
  return !( cp < 0xDC00 || cp > 0xDFFF );
}
// Tells whether the two code units form a UTF-16 surrogate pair: hwc must be
// a high surrogate and lwc a low surrogate. lwc is only examined when hwc
// qualifies.
static NABoolean isUTF16SurrogatePair(NAWchar hwc, NAWchar lwc)
{
  return isHighSurrogateCodePoint(hwc)
      && isLowSurrogateCodePoint(lwc);
}
*** End of commented-out code ***
// UTF-8 related macro definitions and function declarations
// True when the top bit of byte x is clear, i.e. x is a 7-bit ASCII value
// that stands alone as a one-byte UTF-8 character.
#define IS_7_BIT_ASCII_IN_UTF8_CHAR(x) (((x) & 0x80) == 0)
// True when the top two bits of byte x are 10, the bit pattern that marks a
// non-leading byte of a multi-byte UTF-8 character.
#define IS_NOT_1ST_BYTE_IN_UTF8_CHAR(x) (((x) & 0xC0) == 0x80)
// True when the top two bits of byte x are anything other than 10, so x is
// either a 7-bit ASCII value or the leading byte of a multi-byte character.
#define IS_1ST_BYTE_IN_UTF8_CHAR(x) (((x) & 0xC0) != 0x80)
// Takes the first byte of a UTF-8 character. Judging by the name, the result
// is the byte length of that character -- confirm against the definition,
// which is not in this header. Per the note on the declaration, 0 signals an
// error.
Int32 UTF8CharLenInBytes(const unsigned char firstByteOfTheUtf8Char); // returns 0 on errors
// Judging by the name, finds the index of the last byte of the UTF-8
// character located at or before bytePos in utf8Str, a buffer of
// utf8StrLenInBytes bytes -- confirm against the definition, which is not in
// this header. Per the note on the declaration, -1 signals an error.
Int32 IndexOfLastByteOfUTF8CharAtOrBeforePos (const unsigned char *utf8Str,
const Int32 utf8StrLenInBytes,
const Int32 bytePos); // returns -1 on errors
// Judging by the name, finds the index of the first byte of the UTF-8
// character located at or before bytePos in utf8Str, a buffer of
// utf8StrLenInBytes bytes -- confirm against the definition, which is not in
// this header. Per the note on the declaration, -1 signals an error.
Int32 IndexOfFirstByteOfUTF8CharAtOrBeforePos(const unsigned char *utf8Str,
const Int32 utf8StrLenInBytes,
const Int32 bytePos); // returns -1 on errors