| #ifndef UIMA_LANGUAGE_HPP |
| #define UIMA_LANGUAGE_HPP |
| /** \file language.hpp . |
| ----------------------------------------------------------------------------- |
| |
| |
| |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| |
| ----------------------------------------------------------------------------- |
| |
| \brief Contains the Language class |
| |
| Description: |
| |
| ----------------------------------------------------------------------------- |
| |
| |
| 6/10/1999 Initial creation |
| |
| -------------------------------------------------------------------------- */ |
| |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Include dependencies */ |
| /* ----------------------------------------------------------------------- */ |
| |
| #include "uima/pragmas.hpp" // must be first file to be included to get pragmas |
| |
| #include "uima/assertmsg.h" |
| #include "uima/unistrref.hpp" |
| #include "apr_general.h" // For strcasecmp |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Constants */ |
| /* ----------------------------------------------------------------------- */ |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Forward declarations */ |
| /* ----------------------------------------------------------------------- */ |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Types / Classes */ |
| /* ----------------------------------------------------------------------- */ |
| |
| namespace uima { |
| /** |
| The class <TT>Language</TT> models languages in UIMACPP. |
| A language is specified as a string holding a 2-character language and |
| an optional 2-character territory, i.e. as "ll-cc" or "ll". |
| |
| String representation of the simple language sub-part is according to |
| ISO standard 639 "Codes for the representation of the names of languages". |
| |
| String representation of the territory sub-part is according to |
| ISO standard 3166 "Codes for the Representation of Names of Countries". |
| |
| String representation of the full language object is according to |
| IANA RFC 1766 "Tags for the Identification of Languages": <TT><LANG>-<SUBTAG></TT> |
| |
| There is also an internal technical numeric representation of a |
| language as a 4 byte number (32 bit, high-word language value and low-word |
| territory value). Conversion to or from numeric representation is provided |
| via a constructor or conversion operator. |
| |
| The class distinguishes between unspecified and invalid |
| languages and territories. For example, in "en" the territory and sub-language is |
| valid, but unspecified, as opposed to "en-US" where the |
| territory and sub-language is specified as "US". However, in "en-FOO" the |
| territory and sub-language is invalid as there is no such territory code. |
| |
| Because of this, there is more than one way for two language objects to be |
| compatible with each other: |
| |
| <OL> |
| <LI> Full identity: language and sub-language/territory are identical |
| (for example, en-US == en-us). |
| |
| <LI> Match: the language part is identical and the sub-language and territory of |
| at least one of the objects is set to unspecified |
| (for example, en matches en-US). |
| |
| <LI> Match ignoring territory: the language part is identical, both |
| sub-languages are specified but they are different from each other |
| (for example, en-US matches-without-territory en-AU). |
| </OL> |
| |
| Match type 2 is used if a annotator specifies that it can deal with |
| any kind of english text and is not limited, or specialized to |
| US-English. |
| |
| Match type 3 is not supported. |
| |
| \code |
| Language clLanguage(argv[1]); |
| if(!clLanguage.isValid()) { |
| // abort with error |
| //... |
| } |
| if (! ( clLanguage.matches("en") |
| || clLanguage.matches("de") ) ) { |
| // abort with error |
| //... |
| } |
| \endcode |
| |
| Note: As the class is simple, compiler generated copy constructor |
| and the assignment operator can be used. |
| */ |
| class UIMA_LINK_IMPORTSPEC Language { |
| public: |
| /** @name Language constants and types */ |
| /** |
| Special constants for the invalid & unspecified languages |
| */ |
| static char const * INVALID; |
| static char const * UNSPECIFIED; |
| |
| /** A typedef for representing a languages as a numeric value */ |
| typedef long TyLanguageAsNumber; |
| /*@{*/ |
| |
| /** @name Constructors */ |
| /*@{*/ |
| /** Default Constructor: Language::UNSPECIFIED |
| */ |
| Language (void); |
| |
| /** Constructor from a C string. |
| String must have form <TT>language_territory</TT>. For example, "en-US" |
| or just language "en". |
| */ |
| Language ( |
| const TCHAR * cpszLanguageCode |
| ); |
| |
| /** Constructor from a single-byte string (std::string). |
| String must have form <TT>language_territory</TT>. For example, "en-US" |
| or just language "en". |
| */ |
| Language ( |
| const std::string & languageCode |
| ); |
| |
| /** Constructor from a ICU Unicode string. |
| String must have form <TT>language_territory</TT>. For example, "en-US" |
| or just language "en". |
| */ |
| Language ( |
| const icu::UnicodeString & languageCode |
| ); |
| |
| /** Constructor from a UnicodeStringRef. |
| String must have form <TT>language_territory</TT>. For example, "en-US" |
| or just language "en". |
| */ |
| Language ( |
| const UnicodeStringRef & languageCode |
| ); |
| |
| /** |
| Constructor from a 32 bit representation of a language (see asNumber) |
| */ |
| Language ( |
| TyLanguageAsNumber ulLanguageAsLong |
| ); |
| /*@}*/ |
| |
| /** @name Match functions */ |
| /*@{*/ |
| |
| /** Returns TRUE, if this language is identical to the specified language. */ |
| bool |
| operator== ( |
| const Language & crclObject |
| ) const; |
| |
| /** Returns TRUE, if this language is identical to the specified language. */ |
| bool |
| operator!= ( |
| const Language & crclObject |
| ) const; |
| |
| /** Returns TRUE, if this language code sorts before the specified language. */ |
| bool |
| operator< ( |
| const Language & crclOther |
| ) const; |
| |
| /** Returns TRUE, if the languages are identical and |
| either the territories are equal or one is unspecified, |
| or if one of the languages is unspecified. |
| */ |
| bool |
| matches( |
| const Language & crclCompareLang |
| ) const; |
| /*@}*/ |
| |
| /** @name Miscellaneous */ |
| /*@{*/ |
| /** |
| Returns TRUE if language is valid (territory may be missing) |
| */ |
| bool |
| isValid(void) const; |
| |
| /** |
| Get just the 2-character language part, or an empty string if unspecified. |
| */ |
| const char * |
| getLanguageCode(void) const; |
| |
| /** |
| Get a numeric form of just the language (2-characters in top 2-bytes) |
| */ |
| TyLanguageAsNumber |
| getLanguage(void) const; |
| |
| /** |
| Returns TRUE if language has been specified |
| */ |
| bool |
| hasLanguage(void) const; |
| |
| /** |
| Get just the 2-character territory part, or an empty string if unspecified. |
| */ |
| const char * |
| getTerritoryCode(void) const; |
| |
| /** |
| Get a numeric form of just the territory (2-characters in bottom 2-bytes) |
| */ |
| TyLanguageAsNumber |
| getTerritory(void) const; |
| |
| /** |
| Returns TRUE if territory has been specified |
| */ |
| bool |
| hasTerritory(void) const; |
| |
| /** |
| Sets the value according to string <TT>crclString</TT>. |
| <TT>crclString</TT> must have the form <TT><LANG_ID>[-<TERR_ID>]</TT>. |
| */ |
| void |
| setValue( |
| const std::string & crclString |
| ); |
| |
| /** Returns the object in the form <LANGUAGE_CODE>-<TERRITORY_CODE>. |
| For example, en-US. |
| */ |
| std::string |
| asString( void ) const; |
| /** Returns the object in the form <LANGUAGE_CODE>-<TERRITORY_CODE>. |
| For example, en-US. |
| */ |
| icu::UnicodeString |
| asUnicodeString( void ) const; |
| /** Returns the object as a 4-byte "number" |
| (actually just the 4 character bytes, e.g. x656e7472 'enus') |
| */ |
| TyLanguageAsNumber |
| asNumber(void) const; |
| |
| /*@}*/ |
| protected: |
| /* --- functions -------------------------------------------------------- */ |
| /* --- variables -------------------------------------------------------- */ |
| private: |
| /* --- functions -------------------------------------------------------- */ |
| |
| /// Used in ctors to init from a string. |
| void |
| _initFromString( |
| const std::string & crstrLanguageCode |
| ); |
| /// Used in ctors to init from a Unicode string. |
| void |
| _initFromString( |
| UnicodeStringRef crustrLanguageCode |
| ); |
| /// Used in ctors to init from a C string. |
| void |
| init( |
| const char* cpszLanguageCode |
| ); |
| |
| /* --- variables -------------------------------------------------------- */ |
| char iv_locale[6]; |
| char iv_lang[3]; |
| |
| } |
| ; /* Language */ |
| |
| |
| inline Language::Language() { |
| init(Language::UNSPECIFIED); |
| } |
| |
| inline Language::Language( |
| std::string const & crstrLanguageCode |
| ) { |
| _initFromString(crstrLanguageCode); |
| } |
| |
| inline Language::Language( |
| icu::UnicodeString const & crustrLanguageCode |
| ) { |
| _initFromString(UnicodeStringRef(crustrLanguageCode)); |
| } |
| |
| inline Language::Language( |
| UnicodeStringRef const & crustrLanguageCode |
| ) { |
| _initFromString(crustrLanguageCode); |
| } |
| |
| inline Language::Language( |
| TCHAR const * cpszLanguageCode |
| ) { |
| init(cpszLanguageCode); |
| } |
| |
| inline bool |
| Language::operator== ( |
| const Language & crclObject |
| ) const { |
| // Both language & territory must match |
| return (strcasecmp(iv_locale, crclObject.iv_locale) == 0); |
| } |
| |
| inline bool |
| Language::operator!= ( |
| const Language & crclObject |
| ) const { |
| return (strcasecmp(iv_locale, crclObject.iv_locale) != 0); |
| } |
| |
| inline bool |
| Language::operator< ( |
| const Language & crclOther |
| ) const { |
| // Used to order by language & territory enums ... full string should be OK |
| return (strcasecmp(iv_locale, crclOther.iv_locale) < 0); |
| } |
| |
| inline void |
| Language::setValue( |
| const std::string & crclString |
| ) { |
| _initFromString(crclString); |
| } |
| |
| inline std::string |
| Language::asString( void ) const { |
| return (iv_locale); |
| } |
| |
| inline icu::UnicodeString |
| Language::asUnicodeString( void ) const { |
| // this ctor does invariant chars only but it is fast |
| return icu::UnicodeString(iv_locale, ""); |
| } |
| |
| inline Language::TyLanguageAsNumber |
| Language::asNumber(void) const { |
| unsigned long num; |
| const unsigned char * uc = (const unsigned char *) iv_locale; |
| num = uc[0] << 24 | uc[1] << 16 | uc[3] << 8 | uc[4]; |
| return (TyLanguageAsNumber)num; |
| } |
| |
| inline const char * |
| Language::getLanguageCode(void) const { |
| return iv_lang; |
| } |
| |
| inline Language::TyLanguageAsNumber |
| Language::getLanguage(void) const { |
| // Replace old enum by 4-byte "number" with 2-char language in top 2 bytes |
| int num = iv_lang[0] << 24 | iv_lang[1] << 16; |
| return num; |
| } |
| |
| inline bool |
| Language::hasLanguage(void) const { |
| return (iv_locale[0] != '\0'); |
| } |
| |
| inline const char * |
| Language::getTerritoryCode(void) const { |
| return &iv_locale[3]; |
| } |
| |
| inline Language::TyLanguageAsNumber |
| Language::getTerritory(void) const { |
| // Replace old enum by 4-byte "number" with 2-char territory in bottom 2 bytes |
| int num = iv_locale[3] << 8 | iv_locale[4]; |
| return num; |
| } |
| |
| inline bool |
| Language::isValid(void) const { |
| return (iv_locale[0] != '?'); |
| } |
| |
| inline bool |
| Language::hasTerritory(void) const { |
| return (iv_locale[2] != '\0'); |
| } |
| |
| /* ----------------------------------------------------------------------- */ |
| |
| } // namespace uima |
| |
| #endif /* UIMA_LANGUAGE_HPP */ |
| /* <EOF> */ |
| |