blob: fcb67375fe51e6a611803c7afbc3af31a49c15f2 [file] [log] [blame]
/** \file token_properties.hpp .
-----------------------------------------------------------------------------
(upper, lower, etc.)
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
-----------------------------------------------------------------------------
\brief Contains TokenProperties a class encapsulting information about the characters occuring in a token
Description:
-----------------------------------------------------------------------------
8/11/00 Initial creation
-------------------------------------------------------------------------- */
#ifndef UIMA_TOKEN_PROPERTIES_HPP
#define UIMA_TOKEN_PROPERTIES_HPP
/* ----------------------------------------------------------------------- */
/* Include dependencies */
/* ----------------------------------------------------------------------- */
#include "uima/pragmas.hpp" // must be first file to be included to get pragmas
// extended assert: must be before system includes to make sure our version is used
#include "uima/assertmsg.h"
#include "unicode/unistr.h"
#include "uima/types.h"
#include <string>
/* ----------------------------------------------------------------------- */
/* Constants */
/* ----------------------------------------------------------------------- */
/* ----------------------------------------------------------------------- */
/* Forward declarations */
/* ----------------------------------------------------------------------- */
namespace uima {
#ifndef UIMA_STRPTRLENPAIR_HPP
class UnicodeStringRef;
#endif
}
/* ----------------------------------------------------------------------- */
/* Types / Classes */
/* ----------------------------------------------------------------------- */
namespace uima {
/**
The class <TT>TokenProperties</TT> is used to encapsulate information about
the characters occuring in a token (for example, upper and lower).
At the centre it is a bitset, but with inline member functions
for convenient access.
This has to be filled by each compliant tokenizer and stored
with each token.
Example:
\code
\endcode
@see
*/
class UIMA_LINK_IMPORTSPEC TokenProperties {
public:
/** @name Constructors */
/*@{*/
/// Constructs an object, initializing all bit values to zero.
TokenProperties( void );
/// Constructs an object from a UString, computing the bit values for the string.
TokenProperties( const icu::UnicodeString & ustrInputString);
/// Constructs an object from a UString, computing the bit values for the string.
TokenProperties( const UnicodeStringRef & ulstrInputString);
/** Constructs an object from a two pointers, computing the bit values for the string.
* Note: cpucEnd points beyond the end of the string
*/
TokenProperties(
const UChar * cpucCurrent,
const UChar * cpucEnd);
/**
* initializes bits to value of <TT>w32Val</TT>
*/
TokenProperties( WORD32 w32Val );
/*@}*/
/** @name Properties */
/*@{*/
/// true if the first char in the token is upper case
bool hasLeadingUpper( void ) const;
/// sets the <TT>hasLeadingUpper()</TT> property to <TT>bSetOn</TT>
void setLeadingUpper( bool bSetOn = true );
/// true if some char after the first char in the token is upper case
bool hasTrailingUpper( void ) const;
/// sets the <TT>hasTrailingUpper()</TT> property to <TT>bSetOn</TT>
void setTrailingUpper( bool bSetOn = true );
/// true if the token has upper case chars (leading or trailing)
bool hasUpper( void ) const;
/// true if the token has lower case chars
bool hasLower( void ) const;
/// sets the <TT>hasLower()</TT> property to <TT>bSetOn</TT>
void setLower( bool bSetOn = true );
/// true if the token has numeric chars
bool hasNumeric( void ) const;
/// sets the <TT>hasNumeric()</TT> property to <TT>bSetOn</TT>
void setNumeric( bool bSetOn = true);
/// true if the token has special chars (e.g. hyphen, period etc.)
bool hasSpecial( void ) const;
/// sets the <TT>hasSpecial()</TT> property to <TT>bSetOn</TT>
void setSpecial( bool bSetOn = true );
/*@}*/
/** @name Miscellaneous */
/*@{*/
/// true if not hasSpecial() and not hasNumeric()
bool isPlainWord() const;
/// true if only hasUpper()
bool isAllUppercaseWord( void ) const;
/// true if only hasLower()
bool isAllLowercaseWord( void ) const;
/// true if only hasLeadingUpper() and hasTrailingUpper()
bool isInitialUppercaseWord( void ) const;
/** true if hasNumeric() && !(hasLower() || hasUpper())
* Note: this might have decimal point and sign
*/
bool isPlainNumber() const;
/** unlike isPlainNumber() this only allows for digits (no sign and point)
*/
bool isPureNumber() const;
/** true if hasSpecail() && !(hasLower() || hasUpper() || hasNumeric())
* Note: this might have decimal point and sign
*/
bool isPureSpecial() const;
/// Resets all bits in *this, and returns *this
void reset( void );
/// Resets all bits and reinitializes from the string
void
initFromString(
const UChar * cpucCurrent,
const UChar * cpucEnd
);
/**
Returns an object of type string, N characters long.
Each position in the new string is initialized with a character
('0' for zero and '1' for one), representing the value stored in the
corresponding bit position of this.
Character position N - 1 corresponds to bit position 0.
Subsequent decreasing character positions correspond to increasing
bit positions. */
string to_string( void ) const;
/// Returns the integral value corresponding to the bits in *this.
unsigned long to_ulong( void ) const;
/*@}*/
protected:
/* --- functions -------------------------------------------------------- */
/* --- variables -------------------------------------------------------- */
private:
/* --- functions -------------------------------------------------------- */
/* --- variables -------------------------------------------------------- */
WORD32 iv_w32Bits;
}
; /* TokenProperties */
#define UIMA_TOKEN_PROP_LEADING_UPPER 1
#define UIMA_TOKEN_PROP_TRAILING_UPPER 2
#define UIMA_TOKEN_PROP_LOWER 4
#define UIMA_TOKEN_PROP_NUMERIC 8
#define UIMA_TOKEN_PROP_SPECIAL 16
/* ----------------------------------------------------------------------- */
/* Inline Functions */
/* ----------------------------------------------------------------------- */
inline TokenProperties::TokenProperties( void ) :
iv_w32Bits(0) {}
inline TokenProperties::TokenProperties( WORD32 w32Val ) :
iv_w32Bits(w32Val) {}
inline bool TokenProperties::hasTrailingUpper( void ) const {
return((iv_w32Bits & UIMA_TOKEN_PROP_TRAILING_UPPER) != 0);
}
inline void TokenProperties::setTrailingUpper( bool bSetOn ) {
if (bSetOn) {
iv_w32Bits |= UIMA_TOKEN_PROP_TRAILING_UPPER;
} else {
iv_w32Bits &= (~UIMA_TOKEN_PROP_TRAILING_UPPER);
}
}
inline bool TokenProperties::hasLeadingUpper( void ) const {
return((iv_w32Bits & UIMA_TOKEN_PROP_LEADING_UPPER) != 0);
}
inline void TokenProperties::setLeadingUpper( bool bSetOn ) {
if (bSetOn) {
iv_w32Bits |= UIMA_TOKEN_PROP_LEADING_UPPER;
} else {
iv_w32Bits &= (~UIMA_TOKEN_PROP_LEADING_UPPER);
}
}
inline bool TokenProperties::hasUpper( void ) const {
return((iv_w32Bits & (UIMA_TOKEN_PROP_LEADING_UPPER | UIMA_TOKEN_PROP_TRAILING_UPPER)) != 0);
}
inline bool TokenProperties::hasLower( void ) const {
return((iv_w32Bits & UIMA_TOKEN_PROP_LOWER) != 0);
}
inline void TokenProperties::setLower( bool bSetOn) {
if (bSetOn) {
iv_w32Bits |= UIMA_TOKEN_PROP_LOWER;
} else {
iv_w32Bits &= (~UIMA_TOKEN_PROP_LOWER);
}
}
inline bool TokenProperties::hasNumeric( void ) const {
return((iv_w32Bits & UIMA_TOKEN_PROP_NUMERIC) != 0);
}
inline void TokenProperties::setNumeric( bool bSetOn ) {
if (bSetOn) {
iv_w32Bits |= UIMA_TOKEN_PROP_NUMERIC;
} else {
iv_w32Bits &= (~UIMA_TOKEN_PROP_NUMERIC);
}
}
inline bool TokenProperties::hasSpecial( void ) const {
return((iv_w32Bits & UIMA_TOKEN_PROP_SPECIAL) != 0);
}
inline void TokenProperties::setSpecial( bool bSetOn ) {
if (bSetOn) {
iv_w32Bits |= UIMA_TOKEN_PROP_SPECIAL;
} else {
iv_w32Bits &= (~UIMA_TOKEN_PROP_SPECIAL);
}
}
inline bool TokenProperties::isPlainWord( void ) const {
return( ((iv_w32Bits & (UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LOWER | UIMA_TOKEN_PROP_LEADING_UPPER)) != 0)
&& ((iv_w32Bits & ~(UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LOWER | UIMA_TOKEN_PROP_LEADING_UPPER)) == 0));
}
inline bool TokenProperties::isAllUppercaseWord( void ) const {
return( ((iv_w32Bits & (UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LEADING_UPPER)) != 0)
&& ((iv_w32Bits & ~(UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LEADING_UPPER)) == 0));
}
inline bool TokenProperties::isAllLowercaseWord( void ) const {
return(iv_w32Bits == UIMA_TOKEN_PROP_LOWER);
}
inline bool TokenProperties::isInitialUppercaseWord( void ) const {
return( iv_w32Bits == (UIMA_TOKEN_PROP_LOWER | UIMA_TOKEN_PROP_LEADING_UPPER));
}
inline bool TokenProperties::isPlainNumber() const {
return( ((iv_w32Bits & (UIMA_TOKEN_PROP_NUMERIC)) != 0)
&& ((iv_w32Bits & ~(UIMA_TOKEN_PROP_SPECIAL | UIMA_TOKEN_PROP_NUMERIC)) == 0));
}
inline bool TokenProperties::isPureNumber() const {
return(iv_w32Bits == UIMA_TOKEN_PROP_NUMERIC);
}
inline bool TokenProperties::isPureSpecial() const {
return(iv_w32Bits == UIMA_TOKEN_PROP_SPECIAL);
}
inline void TokenProperties::reset( void ) {
iv_w32Bits = 0;
}
inline unsigned long TokenProperties::to_ulong( void ) const {
return(unsigned long)iv_w32Bits;
}
}
/* ----------------------------------------------------------------------- */
#endif /* UIMA_TOKEN_PROPERTIES_HPP */
/* <EOF> */