/** \file token_properties.hpp .
-----------------------------------------------------------------------------



           (upper, lower, etc.)

 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.

-----------------------------------------------------------------------------

    \brief  Contains TokenProperties, a class encapsulating information about the characters occurring in a token

   Description:

-----------------------------------------------------------------------------


   8/11/00  Initial creation

-------------------------------------------------------------------------- */

#ifndef UIMA_TOKEN_PROPERTIES_HPP
#define UIMA_TOKEN_PROPERTIES_HPP

/* ----------------------------------------------------------------------- */
/*       Include dependencies                                              */
/* ----------------------------------------------------------------------- */

#include "uima/pragmas.hpp" // must be first file to be included to get pragmas
// extended assert: must be before system includes to make sure our version is used
#include "uima/assertmsg.h"
#include "unicode/unistr.h"
#include "uima/types.h"
#include <string>

/* ----------------------------------------------------------------------- */
/*       Constants                                                         */
/* ----------------------------------------------------------------------- */

/* ----------------------------------------------------------------------- */
/*       Forward declarations                                              */
/* ----------------------------------------------------------------------- */

namespace uima {
#ifndef UIMA_STRPTRLENPAIR_HPP
  class UnicodeStringRef;
#endif
}

/* ----------------------------------------------------------------------- */
/*       Types / Classes                                                   */
/* ----------------------------------------------------------------------- */

namespace uima {

  /**
     The class <TT>TokenProperties</TT> is used to encapsulate information about
     the characters occuring in a token (for example, upper and lower).
     At the centre it is a bitset, but with inline member functions
     for convenient access.
     This has to be filled by each compliant tokenizer and stored
     with each token.
     Example:
     \code
     \endcode
     @see
  */
  class UIMA_LINK_IMPORTSPEC TokenProperties {
  public:
    /** @name Constructors */
    /*@{*/
    /// Constructs an object, initializing all bit values to zero.
    TokenProperties( void );
    /// Constructs an object from a UString, computing the bit values for the string.
    TokenProperties( const icu::UnicodeString & ustrInputString);
    /// Constructs an object from a UString, computing the bit values for the string.
    TokenProperties( const UnicodeStringRef & ulstrInputString);
    /** Constructs an object from a two pointers, computing the bit values for the string.
     * Note: cpucEnd points beyond the end of the string
     */
    TokenProperties(
      const UChar * cpucCurrent,
      const UChar * cpucEnd);
    /**
     * initializes bits to value of <TT>w32Val</TT>
     */
    TokenProperties( WORD32 w32Val );
    /*@}*/
    /** @name Properties */
    /*@{*/
    /// true if the first char in the token is upper case
    bool hasLeadingUpper( void ) const;
    /// sets the <TT>hasLeadingUpper()</TT> property to <TT>bSetOn</TT>
    void setLeadingUpper( bool bSetOn = true );

    /// true if some char after the first char in the token is upper case
    bool hasTrailingUpper( void ) const;
    /// sets the <TT>hasTrailingUpper()</TT> property to <TT>bSetOn</TT>
    void setTrailingUpper( bool bSetOn = true );

    /// true if the token has upper case chars (leading or trailing)
    bool hasUpper( void ) const;

    /// true if the token has lower case chars
    bool hasLower( void ) const;
    /// sets the <TT>hasLower()</TT> property to <TT>bSetOn</TT>
    void setLower( bool bSetOn = true );

    /// true if the token has numeric chars
    bool hasNumeric( void ) const;
    /// sets the <TT>hasNumeric()</TT> property to <TT>bSetOn</TT>
    void setNumeric( bool bSetOn = true);

    /// true if the token has special chars (e.g. hyphen, period etc.)
    bool hasSpecial( void ) const;
    /// sets the <TT>hasSpecial()</TT> property to <TT>bSetOn</TT>
    void setSpecial( bool bSetOn = true );
    /*@}*/
    /** @name Miscellaneous */
    /*@{*/
    /// true if not hasSpecial() and not hasNumeric()
    bool isPlainWord() const;

    /// true if only hasUpper()
    bool isAllUppercaseWord( void ) const;
    /// true if only hasLower()
    bool isAllLowercaseWord( void ) const;
    /// true if only hasLeadingUpper() and hasTrailingUpper()
    bool isInitialUppercaseWord( void ) const;
    /** true if hasNumeric() && !(hasLower() || hasUpper())
     *  Note: this might have decimal point and sign
     */
    bool isPlainNumber() const;

    /** unlike isPlainNumber() this only allows for digits (no sign and point)
     */
    bool isPureNumber() const;

    /** true if hasSpecail() && !(hasLower() || hasUpper() || hasNumeric())
     *  Note: this might have decimal point and sign
     */
    bool isPureSpecial() const;

    /// Resets all bits in *this, and returns *this
    void reset( void );

    /// Resets all bits and reinitializes from the string
    void
    initFromString(
      const UChar * cpucCurrent,
      const UChar * cpucEnd
    );

    /**
       Returns an object of type string, N characters long.
       Each position in the new string is initialized with a character
       ('0' for zero and '1' for one), representing the value stored in the
       corresponding bit position of this.
       Character position N - 1 corresponds to bit position 0.
       Subsequent decreasing character positions correspond to increasing
       bit positions. */
    string to_string( void ) const;

    /// Returns the integral value corresponding to the bits in *this.
    unsigned long to_ulong( void ) const;

    /*@}*/
  protected:
    /* --- functions -------------------------------------------------------- */
    /* --- variables -------------------------------------------------------- */
  private:
    /* --- functions -------------------------------------------------------- */
    /* --- variables -------------------------------------------------------- */
    WORD32  iv_w32Bits;

  }
  ; /* TokenProperties */

/* Bit masks for the individual token properties kept in TokenProperties.
   Each mask occupies one distinct bit, so masks can be combined with '|'. */
#define UIMA_TOKEN_PROP_LEADING_UPPER   0x01  /* bit 0: tested by hasLeadingUpper()  */
#define UIMA_TOKEN_PROP_TRAILING_UPPER  0x02  /* bit 1: tested by hasTrailingUpper() */
#define UIMA_TOKEN_PROP_LOWER           0x04  /* bit 2: tested by hasLower()         */
#define UIMA_TOKEN_PROP_NUMERIC         0x08  /* bit 3: tested by hasNumeric()       */
#define UIMA_TOKEN_PROP_SPECIAL         0x10  /* bit 4: tested by hasSpecial()       */

  /* ----------------------------------------------------------------------- */
  /*       Inline Functions                                                  */
  /* ----------------------------------------------------------------------- */


  /// Default constructor: every property bit starts out cleared.
  inline TokenProperties::TokenProperties()
      : iv_w32Bits(0) {
  }
  /// Constructs from a raw bit pattern; <TT>w32Val</TT> is stored unchanged.
  inline TokenProperties::TokenProperties( WORD32 w32Val )
      : iv_w32Bits(w32Val) {
  }
  /// Reports whether the UIMA_TOKEN_PROP_TRAILING_UPPER bit is set.
  inline bool TokenProperties::hasTrailingUpper() const {
    const bool bTrailingUpperSet = (iv_w32Bits & UIMA_TOKEN_PROP_TRAILING_UPPER) != 0;
    return bTrailingUpperSet;
  }
  /// Sets (bSetOn == true) or clears (bSetOn == false) the
  /// UIMA_TOKEN_PROP_TRAILING_UPPER bit; all other bits are left untouched.
  inline void TokenProperties::setTrailingUpper( bool bSetOn ) {
    if (!bSetOn) {
      iv_w32Bits &= ~UIMA_TOKEN_PROP_TRAILING_UPPER;
      return;
    }
    iv_w32Bits |= UIMA_TOKEN_PROP_TRAILING_UPPER;
  }

  /// Reports whether the UIMA_TOKEN_PROP_LEADING_UPPER bit is set.
  inline bool TokenProperties::hasLeadingUpper() const {
    const bool bLeadingUpperSet = (iv_w32Bits & UIMA_TOKEN_PROP_LEADING_UPPER) != 0;
    return bLeadingUpperSet;
  }
  /// Sets (bSetOn == true) or clears (bSetOn == false) the
  /// UIMA_TOKEN_PROP_LEADING_UPPER bit; all other bits are left untouched.
  inline void TokenProperties::setLeadingUpper( bool bSetOn ) {
    if (!bSetOn) {
      iv_w32Bits &= ~UIMA_TOKEN_PROP_LEADING_UPPER;
      return;
    }
    iv_w32Bits |= UIMA_TOKEN_PROP_LEADING_UPPER;
  }
  /// Reports whether at least one of the two upper-case bits
  /// (leading or trailing) is set.
  inline bool TokenProperties::hasUpper() const {
    const int kUpperBits = UIMA_TOKEN_PROP_LEADING_UPPER | UIMA_TOKEN_PROP_TRAILING_UPPER;
    return (iv_w32Bits & kUpperBits) != 0;
  }

  /// Reports whether the UIMA_TOKEN_PROP_LOWER bit is set.
  inline bool TokenProperties::hasLower() const {
    const bool bLowerSet = (iv_w32Bits & UIMA_TOKEN_PROP_LOWER) != 0;
    return bLowerSet;
  }
  /// Sets (bSetOn == true) or clears (bSetOn == false) the
  /// UIMA_TOKEN_PROP_LOWER bit; all other bits are left untouched.
  inline void TokenProperties::setLower( bool bSetOn ) {
    if (!bSetOn) {
      iv_w32Bits &= ~UIMA_TOKEN_PROP_LOWER;
      return;
    }
    iv_w32Bits |= UIMA_TOKEN_PROP_LOWER;
  }

  /// Reports whether the UIMA_TOKEN_PROP_NUMERIC bit is set.
  inline bool TokenProperties::hasNumeric() const {
    const bool bNumericSet = (iv_w32Bits & UIMA_TOKEN_PROP_NUMERIC) != 0;
    return bNumericSet;
  }
  /// Sets (bSetOn == true) or clears (bSetOn == false) the
  /// UIMA_TOKEN_PROP_NUMERIC bit; all other bits are left untouched.
  inline void TokenProperties::setNumeric( bool bSetOn ) {
    if (!bSetOn) {
      iv_w32Bits &= ~UIMA_TOKEN_PROP_NUMERIC;
      return;
    }
    iv_w32Bits |= UIMA_TOKEN_PROP_NUMERIC;
  }

  /// Reports whether the UIMA_TOKEN_PROP_SPECIAL bit is set.
  inline bool TokenProperties::hasSpecial() const {
    const bool bSpecialSet = (iv_w32Bits & UIMA_TOKEN_PROP_SPECIAL) != 0;
    return bSpecialSet;
  }
  /// Sets (bSetOn == true) or clears (bSetOn == false) the
  /// UIMA_TOKEN_PROP_SPECIAL bit; all other bits are left untouched.
  inline void TokenProperties::setSpecial( bool bSetOn ) {
    if (!bSetOn) {
      iv_w32Bits &= ~UIMA_TOKEN_PROP_SPECIAL;
      return;
    }
    iv_w32Bits |= UIMA_TOKEN_PROP_SPECIAL;
  }

  /// True when at least one letter bit (leading upper, trailing upper, lower)
  /// is set and no bit outside those three is set.
  inline bool TokenProperties::isPlainWord() const {
    const int kLetterBits = UIMA_TOKEN_PROP_TRAILING_UPPER
                            | UIMA_TOKEN_PROP_LOWER
                            | UIMA_TOKEN_PROP_LEADING_UPPER;
    if ((iv_w32Bits & kLetterBits) == 0) {
      return false;   // no letters at all
    }
    // any bit outside the letter bits disqualifies the token
    return (iv_w32Bits & ~kLetterBits) == 0;
  }
  /// True when at least one upper-case bit (leading or trailing) is set
  /// and no bit outside those two is set.
  inline bool TokenProperties::isAllUppercaseWord() const {
    const int kUpperBits = UIMA_TOKEN_PROP_TRAILING_UPPER | UIMA_TOKEN_PROP_LEADING_UPPER;
    if ((iv_w32Bits & kUpperBits) == 0) {
      return false;   // no upper-case chars at all
    }
    // any bit outside the upper-case bits disqualifies the token
    return (iv_w32Bits & ~kUpperBits) == 0;
  }
  /// True when the lower-case bit is the one and only bit set.
  inline bool TokenProperties::isAllLowercaseWord() const {
    const int kExpectedBits = UIMA_TOKEN_PROP_LOWER;
    return iv_w32Bits == kExpectedBits;
  }
  /// True when exactly the leading-upper and lower bits are set
  /// (no trailing upper, numeric, special or any other bit).
  inline bool TokenProperties::isInitialUppercaseWord() const {
    const int kExpectedBits = UIMA_TOKEN_PROP_LOWER | UIMA_TOKEN_PROP_LEADING_UPPER;
    return iv_w32Bits == kExpectedBits;
  }
  /// True when the numeric bit is set and no bit other than numeric and
  /// special is set (the special bit is tolerated, e.g. for sign or point).
  inline bool TokenProperties::isPlainNumber() const {
    if ((iv_w32Bits & UIMA_TOKEN_PROP_NUMERIC) == 0) {
      return false;   // no digits at all
    }
    const int kAllowedBits = UIMA_TOKEN_PROP_SPECIAL | UIMA_TOKEN_PROP_NUMERIC;
    return (iv_w32Bits & ~kAllowedBits) == 0;
  }
  /// True when the numeric bit is the one and only bit set.
  inline bool TokenProperties::isPureNumber() const {
    const int kExpectedBits = UIMA_TOKEN_PROP_NUMERIC;
    return iv_w32Bits == kExpectedBits;
  }
  /// True when the special bit is the one and only bit set.
  inline bool TokenProperties::isPureSpecial() const {
    const int kExpectedBits = UIMA_TOKEN_PROP_SPECIAL;
    return iv_w32Bits == kExpectedBits;
  }


  /// Clears every property bit; returns nothing.
  inline void TokenProperties::reset()
  {
    iv_w32Bits = 0;
  }

  /// Returns the stored bit pattern converted to <TT>unsigned long</TT>.
  inline unsigned long TokenProperties::to_ulong() const {
    const unsigned long ulBits = static_cast<unsigned long>(iv_w32Bits);
    return ulBits;
  }

}

/* ----------------------------------------------------------------------- */
#endif /* UIMA_TOKEN_PROPERTIES_HPP */

/* <EOF> */




