blob: 532efaf47898b4f111a9b3fe68ee864d48e85af9 [file] [log] [blame]
/**************************************************************
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*************************************************************/
#include <com/sun/star/i18n/UnicodeType.hpp>
#include <com/sun/star/i18n/KCharacterType.hpp>
#include <i18nutil/unicode.hxx>
#include "unicode_data.h"
using namespace ::com::sun::star::i18n;
static ScriptTypeList defaultTypeList[] = {
{ UnicodeScript_kBasicLatin,
UnicodeScript_kBasicLatin,
UnicodeScript_kBasicLatin }, // 0,
{ UnicodeScript_kLatin1Supplement,
UnicodeScript_kLatin1Supplement,
UnicodeScript_kLatin1Supplement },// 1,
{ UnicodeScript_kLatinExtendedA,
UnicodeScript_kLatinExtendedA,
UnicodeScript_kLatinExtendedA }, // 2,
{ UnicodeScript_kLatinExtendedB,
UnicodeScript_kLatinExtendedB,
UnicodeScript_kLatinExtendedB }, // 3,
{ UnicodeScript_kIPAExtension,
UnicodeScript_kIPAExtension,
UnicodeScript_kIPAExtension }, // 4,
{ UnicodeScript_kSpacingModifier,
UnicodeScript_kSpacingModifier,
UnicodeScript_kSpacingModifier }, // 5,
{ UnicodeScript_kCombiningDiacritical,
UnicodeScript_kCombiningDiacritical,
UnicodeScript_kCombiningDiacritical }, // 6,
{ UnicodeScript_kGreek,
UnicodeScript_kGreek,
UnicodeScript_kGreek }, // 7,
{ UnicodeScript_kCyrillic,
UnicodeScript_kCyrillic,
UnicodeScript_kCyrillic }, // 8,
{ UnicodeScript_kArmenian,
UnicodeScript_kArmenian,
UnicodeScript_kArmenian }, // 9,
{ UnicodeScript_kHebrew,
UnicodeScript_kHebrew,
UnicodeScript_kHebrew }, // 10,
{ UnicodeScript_kArabic,
UnicodeScript_kArabic,
UnicodeScript_kArabic }, // 11,
{ UnicodeScript_kSyriac,
UnicodeScript_kSyriac,
UnicodeScript_kSyriac }, // 12,
{ UnicodeScript_kThaana,
UnicodeScript_kThaana,
UnicodeScript_kThaana }, // 13,
{ UnicodeScript_kDevanagari,
UnicodeScript_kDevanagari,
UnicodeScript_kDevanagari }, // 14,
{ UnicodeScript_kBengali,
UnicodeScript_kBengali,
UnicodeScript_kBengali }, // 15,
{ UnicodeScript_kGurmukhi,
UnicodeScript_kGurmukhi,
UnicodeScript_kGurmukhi }, // 16,
{ UnicodeScript_kGujarati,
UnicodeScript_kGujarati,
UnicodeScript_kGujarati }, // 17,
{ UnicodeScript_kOriya,
UnicodeScript_kOriya,
UnicodeScript_kOriya }, // 18,
{ UnicodeScript_kTamil,
UnicodeScript_kTamil,
UnicodeScript_kTamil }, // 19,
{ UnicodeScript_kTelugu,
UnicodeScript_kTelugu,
UnicodeScript_kTelugu }, // 20,
{ UnicodeScript_kKannada,
UnicodeScript_kKannada,
UnicodeScript_kKannada }, // 21,
{ UnicodeScript_kMalayalam,
UnicodeScript_kMalayalam,
UnicodeScript_kMalayalam }, // 22,
{ UnicodeScript_kSinhala,
UnicodeScript_kSinhala,
UnicodeScript_kSinhala }, // 23,
{ UnicodeScript_kThai,
UnicodeScript_kThai,
UnicodeScript_kThai }, // 24,
{ UnicodeScript_kLao,
UnicodeScript_kLao,
UnicodeScript_kLao }, // 25,
{ UnicodeScript_kTibetan,
UnicodeScript_kTibetan,
UnicodeScript_kTibetan }, // 26,
{ UnicodeScript_kMyanmar,
UnicodeScript_kMyanmar,
UnicodeScript_kMyanmar }, // 27,
{ UnicodeScript_kGeorgian,
UnicodeScript_kGeorgian,
UnicodeScript_kGeorgian }, // 28,
{ UnicodeScript_kHangulJamo,
UnicodeScript_kHangulJamo,
UnicodeScript_kHangulJamo }, // 29,
{ UnicodeScript_kEthiopic,
UnicodeScript_kEthiopic,
UnicodeScript_kEthiopic }, // 30,
{ UnicodeScript_kCherokee,
UnicodeScript_kCherokee,
UnicodeScript_kCherokee }, // 31,
{ UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32,
{ UnicodeScript_kOgham,
UnicodeScript_kOgham,
UnicodeScript_kOgham }, // 33,
{ UnicodeScript_kRunic,
UnicodeScript_kRunic,
UnicodeScript_kRunic }, // 34,
{ UnicodeScript_kKhmer,
UnicodeScript_kKhmer,
UnicodeScript_kKhmer }, // 35,
{ UnicodeScript_kMongolian,
UnicodeScript_kMongolian,
UnicodeScript_kMongolian }, // 36,
{ UnicodeScript_kLatinExtendedAdditional,
UnicodeScript_kLatinExtendedAdditional,
UnicodeScript_kLatinExtendedAdditional }, // 37,
{ UnicodeScript_kGreekExtended,
UnicodeScript_kGreekExtended,
UnicodeScript_kGreekExtended }, // 38,
{ UnicodeScript_kGeneralPunctuation,
UnicodeScript_kGeneralPunctuation,
UnicodeScript_kGeneralPunctuation }, // 39,
{ UnicodeScript_kSuperSubScript,
UnicodeScript_kSuperSubScript,
UnicodeScript_kSuperSubScript }, // 40,
{ UnicodeScript_kCurrencySymbolScript,
UnicodeScript_kCurrencySymbolScript,
UnicodeScript_kCurrencySymbolScript }, // 41,
{ UnicodeScript_kSymbolCombiningMark,
UnicodeScript_kSymbolCombiningMark,
UnicodeScript_kSymbolCombiningMark }, // 42,
{ UnicodeScript_kLetterlikeSymbol,
UnicodeScript_kLetterlikeSymbol,
UnicodeScript_kLetterlikeSymbol }, // 43,
{ UnicodeScript_kNumberForm,
UnicodeScript_kNumberForm,
UnicodeScript_kNumberForm }, // 44,
{ UnicodeScript_kArrow,
UnicodeScript_kArrow,
UnicodeScript_kArrow }, // 45,
{ UnicodeScript_kMathOperator,
UnicodeScript_kMathOperator,
UnicodeScript_kMathOperator }, // 46,
{ UnicodeScript_kMiscTechnical,
UnicodeScript_kMiscTechnical,
UnicodeScript_kMiscTechnical }, // 47,
{ UnicodeScript_kControlPicture,
UnicodeScript_kControlPicture,
UnicodeScript_kControlPicture }, // 48,
{ UnicodeScript_kOpticalCharacter,
UnicodeScript_kOpticalCharacter,
UnicodeScript_kOpticalCharacter }, // 49,
{ UnicodeScript_kEnclosedAlphanumeric,
UnicodeScript_kEnclosedAlphanumeric,
UnicodeScript_kEnclosedAlphanumeric }, // 50,
{ UnicodeScript_kBoxDrawing,
UnicodeScript_kBoxDrawing,
UnicodeScript_kBoxDrawing }, // 51,
{ UnicodeScript_kBlockElement,
UnicodeScript_kBlockElement,
UnicodeScript_kBlockElement }, // 52,
{ UnicodeScript_kGeometricShape,
UnicodeScript_kGeometricShape,
UnicodeScript_kGeometricShape }, // 53,
{ UnicodeScript_kMiscSymbol,
UnicodeScript_kMiscSymbol,
UnicodeScript_kMiscSymbol }, // 54,
{ UnicodeScript_kDingbat,
UnicodeScript_kDingbat,
UnicodeScript_kDingbat }, // 55,
{ UnicodeScript_kBraillePatterns,
UnicodeScript_kBraillePatterns,
UnicodeScript_kBraillePatterns }, // 56,
{ UnicodeScript_kCJKRadicalsSupplement,
UnicodeScript_kCJKRadicalsSupplement,
UnicodeScript_kCJKRadicalsSupplement }, // 57,
{ UnicodeScript_kKangxiRadicals,
UnicodeScript_kKangxiRadicals,
UnicodeScript_kKangxiRadicals }, // 58,
{ UnicodeScript_kIdeographicDescriptionCharacters,
UnicodeScript_kIdeographicDescriptionCharacters,
UnicodeScript_kIdeographicDescriptionCharacters }, // 59,
{ UnicodeScript_kCJKSymbolPunctuation,
UnicodeScript_kCJKSymbolPunctuation,
UnicodeScript_kCJKSymbolPunctuation }, // 60,
{ UnicodeScript_kHiragana,
UnicodeScript_kHiragana,
UnicodeScript_kHiragana }, // 61,
{ UnicodeScript_kKatakana,
UnicodeScript_kKatakana,
UnicodeScript_kKatakana }, // 62,
{ UnicodeScript_kBopomofo,
UnicodeScript_kBopomofo,
UnicodeScript_kBopomofo }, // 63,
{ UnicodeScript_kHangulCompatibilityJamo,
UnicodeScript_kHangulCompatibilityJamo,
UnicodeScript_kHangulCompatibilityJamo }, // 64,
{ UnicodeScript_kKanbun,
UnicodeScript_kKanbun,
UnicodeScript_kKanbun }, // 65,
{ UnicodeScript_kBopomofoExtended,
UnicodeScript_kBopomofoExtended,
UnicodeScript_kBopomofoExtended }, // 66,
{ UnicodeScript_kEnclosedCJKLetterMonth,
UnicodeScript_kEnclosedCJKLetterMonth,
UnicodeScript_kEnclosedCJKLetterMonth }, // 67,
{ UnicodeScript_kCJKCompatibility,
UnicodeScript_kCJKCompatibility,
UnicodeScript_kCJKCompatibility }, // 68,
{ UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69,
{ UnicodeScript_kCJKUnifiedIdeograph,
UnicodeScript_kCJKUnifiedIdeograph,
UnicodeScript_kCJKUnifiedIdeograph }, // 70,
{ UnicodeScript_kYiSyllables,
UnicodeScript_kYiSyllables,
UnicodeScript_kYiSyllables }, // 71,
{ UnicodeScript_kYiRadicals,
UnicodeScript_kYiRadicals,
UnicodeScript_kYiRadicals }, // 72,
{ UnicodeScript_kHangulSyllable,
UnicodeScript_kHangulSyllable,
UnicodeScript_kHangulSyllable }, // 73,
{ UnicodeScript_kHighSurrogate,
UnicodeScript_kHighSurrogate,
UnicodeScript_kHighSurrogate }, // 74,
{ UnicodeScript_kHighPrivateUseSurrogate,
UnicodeScript_kHighPrivateUseSurrogate,
UnicodeScript_kHighPrivateUseSurrogate }, // 75,
{ UnicodeScript_kLowSurrogate,
UnicodeScript_kLowSurrogate,
UnicodeScript_kLowSurrogate }, // 76,
{ UnicodeScript_kPrivateUse,
UnicodeScript_kPrivateUse,
UnicodeScript_kPrivateUse }, // 77,
{ UnicodeScript_kCJKCompatibilityIdeograph,
UnicodeScript_kCJKCompatibilityIdeograph,
UnicodeScript_kCJKCompatibilityIdeograph }, // 78,
{ UnicodeScript_kAlphabeticPresentation,
UnicodeScript_kAlphabeticPresentation,
UnicodeScript_kAlphabeticPresentation }, // 79,
{ UnicodeScript_kArabicPresentationA,
UnicodeScript_kArabicPresentationA,
UnicodeScript_kArabicPresentationA }, // 80,
{ UnicodeScript_kCombiningHalfMark,
UnicodeScript_kCombiningHalfMark,
UnicodeScript_kCombiningHalfMark }, // 81,
{ UnicodeScript_kCJKCompatibilityForm,
UnicodeScript_kCJKCompatibilityForm,
UnicodeScript_kCJKCompatibilityForm }, // 82,
{ UnicodeScript_kSmallFormVariant,
UnicodeScript_kSmallFormVariant,
UnicodeScript_kSmallFormVariant }, // 83,
{ UnicodeScript_kArabicPresentationB,
UnicodeScript_kArabicPresentationB,
UnicodeScript_kArabicPresentationB }, // 84,
{ UnicodeScript_kNoScript,
UnicodeScript_kNoScript,
UnicodeScript_kNoScript }, // 85,
{ UnicodeScript_kHalfwidthFullwidthForm,
UnicodeScript_kHalfwidthFullwidthForm,
UnicodeScript_kHalfwidthFullwidthForm }, // 86,
{ UnicodeScript_kScriptCount,
UnicodeScript_kScriptCount,
UnicodeScript_kNoScript } // 87,
};
sal_Int16 SAL_CALL
unicode::getUnicodeScriptType( const sal_Unicode ch, ScriptTypeList* typeList, sal_Int16 unknownType ) {
if (!typeList) {
typeList = defaultTypeList;
unknownType = UnicodeScript_kNoScript;
}
sal_Int16 i = 0, type = typeList[0].to;
while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) {
type = typeList[++i].to;
}
return (type < UnicodeScript_kScriptCount &&
ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ?
typeList[i].value : unknownType;
}
sal_Bool SAL_CALL
unicode::isUnicodeScriptType( const sal_Unicode ch, sal_Int16 type) {
return ch >= UnicodeScriptType[type][UnicodeScriptTypeFrom] &&
ch <= UnicodeScriptType[type][UnicodeScriptTypeTo];
}
sal_Unicode SAL_CALL
unicode::getUnicodeScriptStart( UnicodeScript type) {
return UnicodeScriptType[type][UnicodeScriptTypeFrom];
}
sal_Unicode SAL_CALL
unicode::getUnicodeScriptEnd( UnicodeScript type) {
return UnicodeScriptType[type][UnicodeScriptTypeTo];
}
sal_Int16 SAL_CALL
unicode::getUnicodeType( const sal_Unicode ch ) {
static sal_Unicode c = 0x00;
static sal_Int16 r = 0x00;
if (ch == c) return r;
else c = ch;
sal_Int16 address = UnicodeTypeIndex[ch >> 8];
return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
}
sal_uInt8 SAL_CALL
unicode::getUnicodeDirection( const sal_Unicode ch ) {
static sal_Unicode c = 0x00;
static sal_uInt8 r = 0x00;
if (ch == c) return r;
else c = ch;
sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
}
#define bit(name) (1 << name)
#define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER)
#define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER)
#define TITLEMASK bit(UnicodeType::TITLECASE_LETTER)
#define DIGITMASK bit(UnicodeType::DECIMAL_DIGIT_NUMBER)|\
bit(UnicodeType::LETTER_NUMBER)|\
bit(UnicodeType::OTHER_NUMBER)
#define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\
bit(UnicodeType::MODIFIER_LETTER)|\
bit(UnicodeType::OTHER_LETTER)
#define BASEMASK DIGITMASK|ALPHAMASK|\
bit(UnicodeType::NON_SPACING_MARK)|\
bit(UnicodeType::ENCLOSING_MARK)|\
bit(UnicodeType::COMBINING_SPACING_MARK)
#define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\
bit(UnicodeType::LINE_SEPARATOR)|\
bit(UnicodeType::PARAGRAPH_SEPARATOR)
#define PUNCTUATIONMASK bit(UnicodeType::DASH_PUNCTUATION)|\
bit(UnicodeType::INITIAL_PUNCTUATION)|\
bit(UnicodeType::FINAL_PUNCTUATION)|\
bit(UnicodeType::CONNECTOR_PUNCTUATION)|\
bit(UnicodeType::OTHER_PUNCTUATION)
#define SYMBOLMASK bit(UnicodeType::MATH_SYMBOL)|\
bit(UnicodeType::CURRENCY_SYMBOL)|\
bit(UnicodeType::MODIFIER_SYMBOL)|\
bit(UnicodeType::OTHER_SYMBOL)
#define PRINTMASK BASEMASK|SPACEMASK|PUNCTUATIONMASK|SYMBOLMASK
#define CONTROLMASK bit(UnicodeType::CONTROL)|\
bit(UnicodeType::FORMAT)|\
bit(UnicodeType::LINE_SEPARATOR)|\
bit(UnicodeType::PARAGRAPH_SEPARATOR)
#define IsType(func, mask) \
sal_Bool SAL_CALL func( const sal_Unicode ch) {\
return (bit(getUnicodeType(ch)) & (mask)) != 0;\
}
IsType(unicode::isUpper, UPPERMASK)
IsType(unicode::isLower, LOWERMASK)
IsType(unicode::isTitle, DIGITMASK)
IsType(unicode::isControl, CONTROLMASK)
IsType(unicode::isPrint, PRINTMASK)
IsType(unicode::isAlpha, ALPHAMASK)
IsType(unicode::isDigit, DIGITMASK)
IsType(unicode::isAlphaDigit, ALPHAMASK|DIGITMASK)
IsType(unicode::isSpace, SPACEMASK)
IsType(unicode::isBase, BASEMASK)
IsType(unicode::isPunctuation, PUNCTUATIONMASK)
#define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
sal_Bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) {
return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
}
sal_Int32 SAL_CALL unicode::getCharType( const sal_Unicode ch )
{
using namespace ::com::sun::star::i18n::KCharacterType;
switch ( getUnicodeType( ch ) ) {
// Upper
case UnicodeType::UPPERCASE_LETTER :
return UPPER|LETTER|PRINTABLE|BASE_FORM;
// Lower
case UnicodeType::LOWERCASE_LETTER :
return LOWER|LETTER|PRINTABLE|BASE_FORM;
// Title
case UnicodeType::TITLECASE_LETTER :
return TITLE_CASE|LETTER|PRINTABLE|BASE_FORM;
// Letter
case UnicodeType::MODIFIER_LETTER :
case UnicodeType::OTHER_LETTER :
return LETTER|PRINTABLE|BASE_FORM;
// Digit
case UnicodeType::DECIMAL_DIGIT_NUMBER:
case UnicodeType::LETTER_NUMBER:
case UnicodeType::OTHER_NUMBER:
return DIGIT|PRINTABLE|BASE_FORM;
// Base
case UnicodeType::NON_SPACING_MARK:
case UnicodeType::ENCLOSING_MARK:
case UnicodeType::COMBINING_SPACING_MARK:
return BASE_FORM|PRINTABLE;
// Print
case UnicodeType::SPACE_SEPARATOR:
case UnicodeType::DASH_PUNCTUATION:
case UnicodeType::INITIAL_PUNCTUATION:
case UnicodeType::FINAL_PUNCTUATION:
case UnicodeType::CONNECTOR_PUNCTUATION:
case UnicodeType::OTHER_PUNCTUATION:
case UnicodeType::MATH_SYMBOL:
case UnicodeType::CURRENCY_SYMBOL:
case UnicodeType::MODIFIER_SYMBOL:
case UnicodeType::OTHER_SYMBOL:
return PRINTABLE;
// Control
case UnicodeType::CONTROL:
case UnicodeType::FORMAT:
return CONTROL;
case UnicodeType::LINE_SEPARATOR:
case UnicodeType::PARAGRAPH_SEPARATOR:
return CONTROL|PRINTABLE;
// for all others
default:
return 0;
}
}