// src/framework/uima/token_properties.hpp - uima-uimacpp - Git at Google

 /** \file token_properties.hpp .
 -----------------------------------------------------------------------------


            (upper, lower, etc.)

  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.

 -----------------------------------------------------------------------------

     \brief  Contains TokenProperties, a class encapsulating information about the characters occurring in a token

    Description:

 -----------------------------------------------------------------------------


    8/11/00  Initial creation

 -------------------------------------------------------------------------- */

 #ifndef UIMA_TOKEN_PROPERTIES_HPP
 #define UIMA_TOKEN_PROPERTIES_HPP

 /* ----------------------------------------------------------------------- */
 /*       Include dependencies                                              */
 /* ----------------------------------------------------------------------- */

 #include "uima/pragmas.hpp" // must be first file to be included to get pragmas
 // extended assert: must be before system includes to make sure our version is used
 #include "uima/assertmsg.h"
 #include "unicode/unistr.h"
 #include "uima/types.h"
 #include <string>

 /* ----------------------------------------------------------------------- */
 /*       Constants                                                         */
 /* ----------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------- */
 /*       Forward declarations                                              */
 /* ----------------------------------------------------------------------- */

 namespace uima {
 #ifndef UIMA_STRPTRLENPAIR_HPP
   class UnicodeStringRef;
 #endif
 }

 /* ----------------------------------------------------------------------- */
 /*       Types / Classes                                                   */
 /* ----------------------------------------------------------------------- */

 namespace uima {

   /**
      The class <TT>TokenProperties</TT> is used to encapsulate information about
      the characters occurring in a token (for example, upper and lower).
      At the centre it is a bitset (a single <TT>WORD32</TT>), but with inline
      member functions for convenient access.
      This has to be filled by each compliant tokenizer and stored
      with each token.
      Example:
      \code
      TokenProperties props;        // all bits zero
      props.setLeadingUpper();      // first char is upper case
      props.setLower();             // some char is lower case
      // props.isInitialUppercaseWord() and props.isPlainWord() are now true
      \endcode
   */
   class UIMA_LINK_IMPORTSPEC TokenProperties {
   public:
     /** @name Constructors */
     /*@{*/
     /// Constructs an object, initializing all bit values to zero.
     TokenProperties( void );
     /// Constructs an object from a UnicodeString, computing the bit values for the string.
     TokenProperties( const icu::UnicodeString & ustrInputString);
     /// Constructs an object from a UnicodeStringRef, computing the bit values for the string.
     TokenProperties( const UnicodeStringRef & ulstrInputString);
     /** Constructs an object from two pointers, computing the bit values for the string.
      * Note: cpucEnd points beyond the end of the string
      */
     TokenProperties(
       const UChar * cpucCurrent,
       const UChar * cpucEnd);
     /**
      * Initializes the bits to the value of <TT>w32Val</TT>.
      * Note: this constructor is not explicit, so a <TT>WORD32</TT> converts implicitly.
      */
     TokenProperties( WORD32 w32Val );
     /*@}*/
     /** @name Properties */
     /*@{*/
     /// true if the first char in the token is upper case
     bool hasLeadingUpper( void ) const;
     /// sets the <TT>hasLeadingUpper()</TT> property to <TT>bSetOn</TT>
     void setLeadingUpper( bool bSetOn = true );

     /// true if some char after the first char in the token is upper case
     bool hasTrailingUpper( void ) const;
     /// sets the <TT>hasTrailingUpper()</TT> property to <TT>bSetOn</TT>
     void setTrailingUpper( bool bSetOn = true );

     /// true if the token has upper case chars (leading or trailing)
     bool hasUpper( void ) const;

     /// true if the token has lower case chars
     bool hasLower( void ) const;
     /// sets the <TT>hasLower()</TT> property to <TT>bSetOn</TT>
     void setLower( bool bSetOn = true );

     /// true if the token has numeric chars
     bool hasNumeric( void ) const;
     /// sets the <TT>hasNumeric()</TT> property to <TT>bSetOn</TT>
     void setNumeric( bool bSetOn = true);

     /// true if the token has special chars (e.g. hyphen, period etc.)
     bool hasSpecial( void ) const;
     /// sets the <TT>hasSpecial()</TT> property to <TT>bSetOn</TT>
     void setSpecial( bool bSetOn = true );
     /*@}*/
     /** @name Miscellaneous */
     /*@{*/
     /** true if at least one of hasUpper() / hasLower() holds and no other
      *  bit is set (in particular not hasSpecial() and not hasNumeric())
      */
     bool isPlainWord() const;

     /// true if hasUpper() and no bit other than the two upper case bits is set
     bool isAllUppercaseWord( void ) const;
     /// true if hasLower() is the only property set
     bool isAllLowercaseWord( void ) const;
     /** true if exactly hasLeadingUpper() and hasLower() are set
      *  (no trailing upper, numeric or special)
      */
     bool isInitialUppercaseWord( void ) const;
     /** true if hasNumeric() && !(hasLower() || hasUpper())
      *  Note: hasSpecial() is allowed, so this might have decimal point and sign
      */
     bool isPlainNumber() const;

     /** unlike isPlainNumber() this only allows for digits (no sign and point):
      *  true if hasNumeric() is the only property set
      */
     bool isPureNumber() const;

     /** true if hasSpecial() && !(hasLower() || hasUpper() || hasNumeric()),
      *  i.e. hasSpecial() is the only property set
      */
     bool isPureSpecial() const;

     /// Resets all bits in *this (nothing is returned)
     void reset( void );

     /// Resets all bits and reinitializes from the string
     void
     initFromString(
       const UChar * cpucCurrent,
       const UChar * cpucEnd
     );

     /**
        Returns an object of type string, N characters long.
        Each position in the new string is initialized with a character
        ('0' for zero and '1' for one), representing the value stored in the
        corresponding bit position of this.
        Character position N - 1 corresponds to bit position 0.
        Subsequent decreasing character positions correspond to increasing
        bit positions.
        NOTE(review): N is presumably the number of bits; the implementation
        is not part of this header -- confirm there. */
     std::string to_string( void ) const;

     /// Returns the integral value corresponding to the bits in *this.
     unsigned long to_ulong( void ) const;

     /*@}*/
   protected:
     /* --- functions -------------------------------------------------------- */
     /* --- variables -------------------------------------------------------- */
   private:
     /* --- functions -------------------------------------------------------- */
     /* --- variables -------------------------------------------------------- */
     /// the property bits, combined from the UIMA_TOKEN_PROP_* masks
     WORD32  iv_w32Bits;

   }
   ; /* TokenProperties */

 /* Bit masks of the individual token properties; each is a distinct power of */
 /* two so they can be OR-ed together in TokenProperties::iv_w32Bits.         */
 /* Being macros they are not scoped by the enclosing namespace.              */
 #define UIMA_TOKEN_PROP_LEADING_UPPER   1
 #define UIMA_TOKEN_PROP_TRAILING_UPPER  2
 #define UIMA_TOKEN_PROP_LOWER           4
 #define UIMA_TOKEN_PROP_NUMERIC         8
 #define UIMA_TOKEN_PROP_SPECIAL        16

   /* ----------------------------------------------------------------------- */
   /*       Inline Functions                                                  */
   /* ----------------------------------------------------------------------- */


   /// Default constructor: the object starts with no property bit set.
   inline TokenProperties::TokenProperties( void )
       : iv_w32Bits(0) {
   }
   /// Constructs the object with its bits copied verbatim from <TT>w32Val</TT>.
   inline TokenProperties::TokenProperties( WORD32 w32Val )
       : iv_w32Bits(w32Val) {
   }
   /// Tests whether the UIMA_TOKEN_PROP_TRAILING_UPPER bit is set.
   inline bool TokenProperties::hasTrailingUpper( void ) const {
     const bool bBitIsSet = (0 != (iv_w32Bits & UIMA_TOKEN_PROP_TRAILING_UPPER));
     return bBitIsSet;
   }
   /// Sets (bSetOn true) or clears (bSetOn false) the UIMA_TOKEN_PROP_TRAILING_UPPER bit.
   inline void TokenProperties::setTrailingUpper( bool bSetOn ) {
     if (!bSetOn) {
       // clear only this bit, all other bits stay untouched
       iv_w32Bits &= (~UIMA_TOKEN_PROP_TRAILING_UPPER);
       return;
     }
     iv_w32Bits |= UIMA_TOKEN_PROP_TRAILING_UPPER;
   }

   /// Tests whether the UIMA_TOKEN_PROP_LEADING_UPPER bit is set.
   inline bool TokenProperties::hasLeadingUpper( void ) const {
     const bool bBitIsSet = (0 != (iv_w32Bits & UIMA_TOKEN_PROP_LEADING_UPPER));
     return bBitIsSet;
   }
   /// Sets (bSetOn true) or clears (bSetOn false) the UIMA_TOKEN_PROP_LEADING_UPPER bit.
   inline void TokenProperties::setLeadingUpper( bool bSetOn ) {
     if (!bSetOn) {
       // clear only this bit, all other bits stay untouched
       iv_w32Bits &= (~UIMA_TOKEN_PROP_LEADING_UPPER);
       return;
     }
     iv_w32Bits |= UIMA_TOKEN_PROP_LEADING_UPPER;
   }
   /// Tests whether at least one of the two upper case bits (leading, trailing) is set.
   inline bool TokenProperties::hasUpper( void ) const {
     const bool bAnyUpperBit =
       (0 != (iv_w32Bits & (UIMA_TOKEN_PROP_LEADING_UPPER | UIMA_TOKEN_PROP_TRAILING_UPPER)));
     return bAnyUpperBit;
   }

   /// Tests whether the UIMA_TOKEN_PROP_LOWER bit is set.
   inline bool TokenProperties::hasLower( void ) const {
     const bool bBitIsSet = (0 != (iv_w32Bits & UIMA_TOKEN_PROP_LOWER));
     return bBitIsSet;
   }
   /// Sets (bSetOn true) or clears (bSetOn false) the UIMA_TOKEN_PROP_LOWER bit.
   inline void TokenProperties::setLower( bool bSetOn) {
     if (!bSetOn) {
       // clear only this bit, all other bits stay untouched
       iv_w32Bits &= (~UIMA_TOKEN_PROP_LOWER);
       return;
     }
     iv_w32Bits |= UIMA_TOKEN_PROP_LOWER;
   }

   /// Tests whether the UIMA_TOKEN_PROP_NUMERIC bit is set.
   inline bool TokenProperties::hasNumeric( void ) const {
     const bool bBitIsSet = (0 != (iv_w32Bits & UIMA_TOKEN_PROP_NUMERIC));
     return bBitIsSet;
   }
   /// Sets (bSetOn true) or clears (bSetOn false) the UIMA_TOKEN_PROP_NUMERIC bit.
   inline void TokenProperties::setNumeric( bool bSetOn ) {
     if (!bSetOn) {
       // clear only this bit, all other bits stay untouched
       iv_w32Bits &= (~UIMA_TOKEN_PROP_NUMERIC);
       return;
     }
     iv_w32Bits |= UIMA_TOKEN_PROP_NUMERIC;
   }

   /// Tests whether the UIMA_TOKEN_PROP_SPECIAL bit is set.
   inline bool TokenProperties::hasSpecial( void ) const {
     const bool bBitIsSet = (0 != (iv_w32Bits & UIMA_TOKEN_PROP_SPECIAL));
     return bBitIsSet;
   }
   /// Sets (bSetOn true) or clears (bSetOn false) the UIMA_TOKEN_PROP_SPECIAL bit.
   inline void TokenProperties::setSpecial( bool bSetOn ) {
     if (!bSetOn) {
       // clear only this bit, all other bits stay untouched
       iv_w32Bits &= (~UIMA_TOKEN_PROP_SPECIAL);
       return;
     }
     iv_w32Bits |= UIMA_TOKEN_PROP_SPECIAL;
   }

   /// True when at least one letter bit (leading upper, trailing upper, lower)
   /// is set and no bit outside these three is set.
   inline bool TokenProperties::isPlainWord( void ) const {
     // a token without any letter property is not a word
     if ((iv_w32Bits & (UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LOWER | UIMA_TOKEN_PROP_LEADING_UPPER)) == 0) {
       return false;
     }
     // any bit outside the letter bits (numeric, special, ...) disqualifies the token
     return ((iv_w32Bits & ~(UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LOWER | UIMA_TOKEN_PROP_LEADING_UPPER)) == 0);
   }
   /// True when at least one upper case bit is set and no bit other than the
   /// two upper case bits is set.
   inline bool TokenProperties::isAllUppercaseWord( void ) const {
     // needs at least one of the upper case bits
     if ((iv_w32Bits & (UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LEADING_UPPER)) == 0) {
       return false;
     }
     // lower, numeric, special or any further bit disqualifies the token
     return ((iv_w32Bits & ~(UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LEADING_UPPER)) == 0);
   }
   /// True when the lower case bit is the only bit set.
   inline bool TokenProperties::isAllLowercaseWord( void ) const {
     const bool bOnlyLower = (UIMA_TOKEN_PROP_LOWER == iv_w32Bits);
     return bOnlyLower;
   }
   /// True when exactly the leading-upper and the lower case bits are set.
   inline bool TokenProperties::isInitialUppercaseWord( void ) const {
     const bool bExactlyLeadingUpperAndLower =
       ((UIMA_TOKEN_PROP_LOWER | UIMA_TOKEN_PROP_LEADING_UPPER) == iv_w32Bits);
     return bExactlyLeadingUpperAndLower;
   }
   /// True when the numeric bit is set and no bit other than numeric and
   /// special is set.
   inline bool TokenProperties::isPlainNumber() const {
     // without the numeric bit this is no number at all
     if ((iv_w32Bits & (UIMA_TOKEN_PROP_NUMERIC)) == 0) {
       return false;
     }
     // the special bit is tolerated, every other bit is not
     return ((iv_w32Bits & ~(UIMA_TOKEN_PROP_SPECIAL | UIMA_TOKEN_PROP_NUMERIC)) == 0);
   }
   /// True when the numeric bit is the only bit set.
   inline bool TokenProperties::isPureNumber() const {
     const bool bOnlyNumeric = (UIMA_TOKEN_PROP_NUMERIC == iv_w32Bits);
     return bOnlyNumeric;
   }
   /// True when the special bit is the only bit set.
   inline bool TokenProperties::isPureSpecial() const {
     const bool bOnlySpecial = (UIMA_TOKEN_PROP_SPECIAL == iv_w32Bits);
     return bOnlySpecial;
   }


   inline void TokenProperties::reset( void ) {
     iv_w32Bits = 0;
   }

   /// Returns the stored bits converted to <TT>unsigned long</TT>.
   inline unsigned long TokenProperties::to_ulong( void ) const {
     const unsigned long ulBits = static_cast<unsigned long>(iv_w32Bits);
     return ulBits;
   }

 }

 /* ----------------------------------------------------------------------- */
 #endif /* UIMA_TOKEN_PROPERTIES_HPP */

 /* <EOF> */
	/** \file token_properties.hpp .
	-----------------------------------------------------------------------------



	(upper, lower, etc.)

	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.

	-----------------------------------------------------------------------------

	\brief Contains TokenProperties, a class encapsulating information about the characters occurring in a token

	Description:

	-----------------------------------------------------------------------------


	8/11/00 Initial creation

	-------------------------------------------------------------------------- */

	#ifndef UIMA_TOKEN_PROPERTIES_HPP
	#define UIMA_TOKEN_PROPERTIES_HPP

	/* ----------------------------------------------------------------------- */
	/* Include dependencies */
	/* ----------------------------------------------------------------------- */

	#include "uima/pragmas.hpp" // must be first file to be included to get pragmas
	// extended assert: must be before system includes to make sure our version is used
	#include "uima/assertmsg.h"
	#include "unicode/unistr.h"
	#include "uima/types.h"
	#include <string>

	/* ----------------------------------------------------------------------- */
	/* Constants */
	/* ----------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------- */
	/* Forward declarations */
	/* ----------------------------------------------------------------------- */

	namespace uima {
	#ifndef UIMA_STRPTRLENPAIR_HPP
	class UnicodeStringRef;
	#endif
	}

	/* ----------------------------------------------------------------------- */
	/* Types / Classes */
	/* ----------------------------------------------------------------------- */

	namespace uima {

	/**
	The class <TT>TokenProperties</TT> is used to encapsulate information about
	the characters occuring in a token (for example, upper and lower).
	At the centre it is a bitset, but with inline member functions
	for convenient access.
	This has to be filled by each compliant tokenizer and stored
	with each token.
	Example:
	\code
	\endcode
	@see
	*/
	class UIMA_LINK_IMPORTSPEC TokenProperties {
	public:
	/** @name Constructors */
	/@{/
	/// Constructs an object, initializing all bit values to zero.
	TokenProperties( void );
	/// Constructs an object from a UString, computing the bit values for the string.
	TokenProperties( const icu::UnicodeString & ustrInputString);
	/// Constructs an object from a UString, computing the bit values for the string.
	TokenProperties( const UnicodeStringRef & ulstrInputString);
	/** Constructs an object from a two pointers, computing the bit values for the string.
	* Note: cpucEnd points beyond the end of the string
	*/
	TokenProperties(
	const UChar * cpucCurrent,
	const UChar * cpucEnd);
	/**
	* initializes bits to value of <TT>w32Val</TT>
	*/
	TokenProperties( WORD32 w32Val );
	/@}/
	/** @name Properties */
	/@{/
	/// true if the first char in the token is upper case
	bool hasLeadingUpper( void ) const;
	/// sets the <TT>hasLeadingUpper()</TT> property to <TT>bSetOn</TT>
	void setLeadingUpper( bool bSetOn = true );

	/// true if some char after the first char in the token is upper case
	bool hasTrailingUpper( void ) const;
	/// sets the <TT>hasTrailingUpper()</TT> property to <TT>bSetOn</TT>
	void setTrailingUpper( bool bSetOn = true );

	/// true if the token has upper case chars (leading or trailing)
	bool hasUpper( void ) const;

	/// true if the token has lower case chars
	bool hasLower( void ) const;
	/// sets the <TT>hasLower()</TT> property to <TT>bSetOn</TT>
	void setLower( bool bSetOn = true );

	/// true if the token has numeric chars
	bool hasNumeric( void ) const;
	/// sets the <TT>hasNumeric()</TT> property to <TT>bSetOn</TT>
	void setNumeric( bool bSetOn = true);

	/// true if the token has special chars (e.g. hyphen, period etc.)
	bool hasSpecial( void ) const;
	/// sets the <TT>hasSpecial()</TT> property to <TT>bSetOn</TT>
	void setSpecial( bool bSetOn = true );
	/@}/
	/** @name Miscellaneous */
	/@{/
	/// true if not hasSpecial() and not hasNumeric()
	bool isPlainWord() const;

	/// true if only hasUpper()
	bool isAllUppercaseWord( void ) const;
	/// true if only hasLower()
	bool isAllLowercaseWord( void ) const;
	/// true if only hasLeadingUpper() and hasTrailingUpper()
	bool isInitialUppercaseWord( void ) const;
	/** true if hasNumeric() && !(hasLower() \|\| hasUpper())
	* Note: this might have decimal point and sign
	*/
	bool isPlainNumber() const;

	/** unlike isPlainNumber() this only allows for digits (no sign and point)
	*/
	bool isPureNumber() const;

	/** true if hasSpecail() && !(hasLower() \|\| hasUpper() \|\| hasNumeric())
	* Note: this might have decimal point and sign
	*/
	bool isPureSpecial() const;

	/// Resets all bits in this, and returns this
	void reset( void );

	/// Resets all bits and reinitializes from the string
	void
	initFromString(
	const UChar * cpucCurrent,
	const UChar * cpucEnd
	);

	/**
	Returns an object of type string, N characters long.
	Each position in the new string is initialized with a character
	('0' for zero and '1' for one), representing the value stored in the
	corresponding bit position of this.
	Character position N - 1 corresponds to bit position 0.
	Subsequent decreasing character positions correspond to increasing
	bit positions. */
	std::string to_string( void ) const;

	/// Returns the integral value corresponding to the bits in *this.
	unsigned long to_ulong( void ) const;

	/@}/
	protected:
	/* --- functions -------------------------------------------------------- */
	/* --- variables -------------------------------------------------------- */
	private:
	/* --- functions -------------------------------------------------------- */
	/* --- variables -------------------------------------------------------- */
	WORD32 iv_w32Bits;

	}
	; /* TokenProperties */

	/* Bit masks of the individual token properties; each is a distinct power of */
	/* two so they can be OR-ed together in TokenProperties::iv_w32Bits.         */
	/* Being macros they are not scoped by the enclosing namespace.              */
	#define UIMA_TOKEN_PROP_LEADING_UPPER 1
	#define UIMA_TOKEN_PROP_TRAILING_UPPER 2
	#define UIMA_TOKEN_PROP_LOWER 4
	#define UIMA_TOKEN_PROP_NUMERIC 8
	#define UIMA_TOKEN_PROP_SPECIAL 16

	/* ----------------------------------------------------------------------- */
	/* Inline Functions */
	/* ----------------------------------------------------------------------- */


	/// Default constructor: the object starts with no property bit set.
	inline TokenProperties::TokenProperties( void )
	    : iv_w32Bits(0) {
	}
	/// Constructs the object with its bits copied verbatim from <TT>w32Val</TT>.
	inline TokenProperties::TokenProperties( WORD32 w32Val )
	    : iv_w32Bits(w32Val) {
	}
	/// Tests whether the UIMA_TOKEN_PROP_TRAILING_UPPER bit is set.
	inline bool TokenProperties::hasTrailingUpper( void ) const {
	  const bool bBitIsSet = (0 != (iv_w32Bits & UIMA_TOKEN_PROP_TRAILING_UPPER));
	  return bBitIsSet;
	}
	/// Sets (bSetOn true) or clears (bSetOn false) the UIMA_TOKEN_PROP_TRAILING_UPPER bit.
	inline void TokenProperties::setTrailingUpper( bool bSetOn ) {
	  if (bSetOn) {
	    iv_w32Bits |= UIMA_TOKEN_PROP_TRAILING_UPPER;
	  } else {
	    // clear only this bit, all other bits stay untouched
	    iv_w32Bits &= (~UIMA_TOKEN_PROP_TRAILING_UPPER);
	  }
	}

	/// Tests whether the UIMA_TOKEN_PROP_LEADING_UPPER bit is set.
	inline bool TokenProperties::hasLeadingUpper( void ) const {
	  const bool bBitIsSet = (0 != (iv_w32Bits & UIMA_TOKEN_PROP_LEADING_UPPER));
	  return bBitIsSet;
	}
	/// Sets (bSetOn true) or clears (bSetOn false) the UIMA_TOKEN_PROP_LEADING_UPPER bit.
	inline void TokenProperties::setLeadingUpper( bool bSetOn ) {
	  if (bSetOn) {
	    iv_w32Bits |= UIMA_TOKEN_PROP_LEADING_UPPER;
	  } else {
	    // clear only this bit, all other bits stay untouched
	    iv_w32Bits &= (~UIMA_TOKEN_PROP_LEADING_UPPER);
	  }
	}
	/// Tests whether at least one of the two upper case bits (leading, trailing) is set.
	inline bool TokenProperties::hasUpper( void ) const {
	  return((iv_w32Bits & (UIMA_TOKEN_PROP_LEADING_UPPER | UIMA_TOKEN_PROP_TRAILING_UPPER)) != 0);
	}

	/// Tests whether the UIMA_TOKEN_PROP_LOWER bit is set.
	inline bool TokenProperties::hasLower( void ) const {
	  const bool bBitIsSet = (0 != (iv_w32Bits & UIMA_TOKEN_PROP_LOWER));
	  return bBitIsSet;
	}
	/// Sets (bSetOn true) or clears (bSetOn false) the UIMA_TOKEN_PROP_LOWER bit.
	inline void TokenProperties::setLower( bool bSetOn) {
	  if (bSetOn) {
	    iv_w32Bits |= UIMA_TOKEN_PROP_LOWER;
	  } else {
	    // clear only this bit, all other bits stay untouched
	    iv_w32Bits &= (~UIMA_TOKEN_PROP_LOWER);
	  }
	}

	/// Tests whether the UIMA_TOKEN_PROP_NUMERIC bit is set.
	inline bool TokenProperties::hasNumeric( void ) const {
	  const bool bBitIsSet = (0 != (iv_w32Bits & UIMA_TOKEN_PROP_NUMERIC));
	  return bBitIsSet;
	}
	/// Sets (bSetOn true) or clears (bSetOn false) the UIMA_TOKEN_PROP_NUMERIC bit.
	inline void TokenProperties::setNumeric( bool bSetOn ) {
	  if (bSetOn) {
	    iv_w32Bits |= UIMA_TOKEN_PROP_NUMERIC;
	  } else {
	    // clear only this bit, all other bits stay untouched
	    iv_w32Bits &= (~UIMA_TOKEN_PROP_NUMERIC);
	  }
	}

	/// Tests whether the UIMA_TOKEN_PROP_SPECIAL bit is set.
	inline bool TokenProperties::hasSpecial( void ) const {
	  const bool bBitIsSet = (0 != (iv_w32Bits & UIMA_TOKEN_PROP_SPECIAL));
	  return bBitIsSet;
	}
	/// Sets (bSetOn true) or clears (bSetOn false) the UIMA_TOKEN_PROP_SPECIAL bit.
	inline void TokenProperties::setSpecial( bool bSetOn ) {
	  if (bSetOn) {
	    iv_w32Bits |= UIMA_TOKEN_PROP_SPECIAL;
	  } else {
	    // clear only this bit, all other bits stay untouched
	    iv_w32Bits &= (~UIMA_TOKEN_PROP_SPECIAL);
	  }
	}

	/// True when at least one letter bit (leading upper, trailing upper, lower)
	/// is set and no bit outside these three is set.
	inline bool TokenProperties::isPlainWord( void ) const {
	  return( ((iv_w32Bits & (UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LOWER | UIMA_TOKEN_PROP_LEADING_UPPER)) != 0)
	          && ((iv_w32Bits & ~(UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LOWER | UIMA_TOKEN_PROP_LEADING_UPPER)) == 0));
	}
	/// True when at least one upper case bit is set and no bit other than the
	/// two upper case bits is set.
	inline bool TokenProperties::isAllUppercaseWord( void ) const {
	  return( ((iv_w32Bits & (UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LEADING_UPPER)) != 0)
	          && ((iv_w32Bits & ~(UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LEADING_UPPER)) == 0));
	}
	/// True when the lower case bit is the only bit set.
	inline bool TokenProperties::isAllLowercaseWord( void ) const {
	  const bool bOnlyLower = (UIMA_TOKEN_PROP_LOWER == iv_w32Bits);
	  return bOnlyLower;
	}
	/// True when exactly the leading-upper and the lower case bits are set.
	inline bool TokenProperties::isInitialUppercaseWord( void ) const {
	  return( iv_w32Bits == (UIMA_TOKEN_PROP_LOWER | UIMA_TOKEN_PROP_LEADING_UPPER));
	}
	/// True when the numeric bit is set and no bit other than numeric and
	/// special is set.
	inline bool TokenProperties::isPlainNumber() const {
	  return( ((iv_w32Bits & (UIMA_TOKEN_PROP_NUMERIC)) != 0)
	          && ((iv_w32Bits & ~(UIMA_TOKEN_PROP_SPECIAL | UIMA_TOKEN_PROP_NUMERIC)) == 0));
	}
	/// True when the numeric bit is the only bit set.
	inline bool TokenProperties::isPureNumber() const {
	  const bool bOnlyNumeric = (UIMA_TOKEN_PROP_NUMERIC == iv_w32Bits);
	  return bOnlyNumeric;
	}
	/// True when the special bit is the only bit set.
	inline bool TokenProperties::isPureSpecial() const {
	  const bool bOnlySpecial = (UIMA_TOKEN_PROP_SPECIAL == iv_w32Bits);
	  return bOnlySpecial;
	}


	inline void TokenProperties::reset( void ) {
	iv_w32Bits = 0;
	}

	/// Returns the stored bits converted to <TT>unsigned long</TT>.
	inline unsigned long TokenProperties::to_ulong( void ) const {
	  const unsigned long ulBits = static_cast<unsigned long>(iv_w32Bits);
	  return ulBits;
	}

	}

	/* ----------------------------------------------------------------------- */
	#endif /* UIMA_TOKEN_PROPERTIES_HPP */

	/* <EOF> */