blob: eb747129e295321a36cad96c8dda82c404e1d04b [file] [log] [blame]
#ifndef UIMA_ANNOTATOR_TOK_H$
#define UIMA_ANNOTATOR_TOK_H$
/** \file annotator_tok.hpp .
-----------------------------------------------------------------------------
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
\brief Contains AnnotatorTokenizer a Unicode UIMA Tokenizer Annotator.
-------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------- */
/* Include dependencies */
/* ----------------------------------------------------------------------- */
/* We want the timers in this annotator to be only active if the annotator specific
define ANNOTATOR_TIMERS is set (e.g. in the automake.pro file for this annotator)
in all other cases we don't want the timers.
Since all the timers depend on the more generic define DEBUG_TIMING we
define DEBUG_TIMING if and only if ANNOTATOR_TIMERS is set.
Specificaly we don't want the timers set when DEBUG_TIMING is defined to
build a generic timing driver of the whole system.
Our internal annotator timers would bias the whole timing driver with the
overhead involved in calling them in this annotator. This is why we specificaly
undefine DEBUG_TIMING even if it might be set in the makefile to build this
annotator.
If you want timing in this annotator use ANNOTATOR_TIMERS not DEBUG_TIMING
*/
#ifdef ANNOTATOR_TIMERS
# ifndef DEBUG_TIMING
# define DEBUG_TIMING
# endif
#else
# ifdef DEBUG_TIMING
# undef DEBUG_TIMING
# endif
#endif
#include "uima/timedatetools.hpp"
#include "uima/api.hpp" /* UIMA API */
///////#include "uima/u2cpcnvrtbuff.hpp" /* U2CpConvertBuffer */
#include "uima/ss_tokenizer.hpp"
#include "uima/internal_casimpl.hpp"
#define STEMMER_BUF_LEN 50
using namespace uima;
/** @name AnnotatorTokenizer
The class <TT>AnnotatorTokenizer</TT> is used a universal Unicode Tokenizer.
It uses a little trick to check API consistency via an abstract base class,
without having the overhead of virtual functions in our ship version.
<TT>AnnotatorABase</TT> defines all non-static member functions a plug-in needs
to define as pure virtual functions. By making this class inherit from
this base class we can make sure that compilation will fail if the
interfaces change.
Since we don't really use the inheritance relationship we don't define
it in the ship version.
Example:
\code
\endcode
@see AnnotatorABase
*/
class AnnotatorTokenizer : public Tokenizer , public TextAnnotator {
public:
/** @name Constructors */
/*@{*/
/** Default Constructor.
*/
AnnotatorTokenizer(void);
/*@}*/
virtual ~AnnotatorTokenizer(void); //lint !e1509: base class destructor for class 'AnnotatorABase' is not virtual
/** @name Annotator Processing Functions */
/*@{*/
/** call the UIMA Annotator to initialize itself based on a UIMA engine
and a UIMA Configuration section and return a UIMA error code */
TyErrorId
initialize(
AnnotatorContext & rclAnnotatorContext
); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::init(Engine &, ConfigAnnotator &) (line 79, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
TyErrorId typeSystemInit(TypeSystem const &);
/** call the UIMA Annotator to deinitialize itself based on a UIMA engine
and return a UIMA error code */
TyErrorId
destroy(); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::deInit(Engine &) (line 83, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
/** call the UIMA Annotator to reconfigure itself based on a UIMA Configuration
section and return a UIMA error code */
TyErrorId
reconfigure(
); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::config(ConfigAnnotator &) (line 87, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
protected:
/** call the UIMA Annotator to perform its doc related duty based on a UIMA engine
and return a UIMA error code */
TyErrorId
process(
CAS &,
const ResultSpecification & crclTargetSet
); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::processDocument(Engine &, const TargetSetAT &, const TargetSetTT &) (line 91, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
/*@}*/
protected:
virtual int tokenCallback( unsigned long ulLocation, unsigned long ulLength,
TokenProperties & crclTokenProperties,
bool bNewPara, bool bNewSent );
TyDocIndex iv_uiParagraphStartIndex;
TyDocIndex iv_uiSentenceStartIndex;
// segment numbers
size_t iv_uiTokenNbr;
size_t iv_uiSentenceNbr;
size_t iv_uiParagraphNbr;
private:
// number of the first token/sentence/paragraph
const size_t iv_cuiCOUNTER_START;
/* --------------------------------------------------*/
/* config values we use */
/* --------------------------------------------------*/
/// Enum listing all the config option we support
enum EnAnnotatorConfigOptions {
enConfigOption_TokenNumbersIncludeStopwords,
enConfigOption_UseRelativeTokenAndSentenceNumbers,
enConfigOption_IgnorePunctuationTokens,
// (drop inifile support) enConfigOption_CharMapConfigFilename,
enNumberOfConfigOptions // must be last in enum
};
// our config table
static const ConfigOptionInfo::StOptionInfo cv_astConfigOptionInfo[enNumberOfConfigOptions];
// Variables the config options are stored in:
// if this is true the token numbers are counted including stopwords
bool iv_bTokenNumbersIncludeStopwords;
// if this is true token and sentence number are reset to 1
// for each new sentence/paragraph
bool iv_bUseRelativeTokenAndSentenceNumbers;
// If true, punctuation tokens are ignored
bool iv_bIgnorePunctuationTokens;
// trace component ID
uima::TyComponentId iv_iTraceCompID;
// Some pointers for quick access to UIMA objects. Initialized in init()
uima::internal::CASImpl * iv_pCASImpl;
lowlevel::FSHeap * iv_pFSHeap;
lowlevel::IndexRepository * iv_pIndexRepository;
// FSTypes and corresponding sizes
lowlevel::TyFSType iv_tyTokenType;
lowlevel::TyFeatureOffset iv_tyTokenTypeSize;
lowlevel::TyFSType iv_tySentenceType;
lowlevel::TyFeatureOffset iv_tySentenceTypeSize;
lowlevel::TyFSType iv_tyParagraphType;
lowlevel::TyFeatureOffset iv_tyParagraphTypeSize;
// FSFeatures
lowlevel::TyFeatureOffset iv_tySofaFeatureOffset;
lowlevel::TyFeatureOffset iv_tyBeginPositionFeatureOffset;
lowlevel::TyFeatureOffset iv_tyEndPositionFeatureOffset;
lowlevel::TyFeatureOffset iv_tyTokenPropertiesFeatureOffset;
lowlevel::TyFeatureOffset iv_tyTokenNbrFeatureOffset;
lowlevel::TyFeatureOffset iv_tySentenceNbrFeatureOffset;
lowlevel::TyFeatureOffset iv_tyParagraphNbrFeatureOffset;
lowlevel::TyFSFeature iv_stemFeature;
// needed output types
bool iv_bIsTokenReq;
bool iv_bIsSentenceReq;
bool iv_bIsParagraphReq;
bool iv_stemsRequired;
TokenProperties iv_clTokenProperties;
#ifdef DEBUG_TIMING
uima::Timer iv_clTotalTimer;
uima::Timer iv_clSSTokTimer;
uima::Timer iv_clUimaAnCreateTimer;
uima::Timer iv_clUimaAnSetValTimer;
#endif
/* --- functions --- */
/* COPY CONSTRUCTOR NOT SUPPORTED */
AnnotatorTokenizer(const AnnotatorTokenizer & ); //lint !e1704
/* ASSIGNMENT OPERATOR NOT SUPPORTED */
AnnotatorTokenizer & operator=(const AnnotatorTokenizer & crclObject);
/// (Re-) Access config values. Used in init() and config().
TyErrorId
getConfigValues(
AnnotatorContext & rclConfig
);
/// member functions for adding annotations
void
addNewTokenAnnotation(
TyDocIndex tyBeginPos,
TyDocIndex tyEndPos
);
/// member functions for adding annotations
void
addNewSentenceAnnotation(
TyDocIndex tyBeginPos,
TyDocIndex tyEndPos
);
/// member functions for adding annotations
void
addNewParagraphAnnotation(
TyDocIndex tyBeginPos,
TyDocIndex tyEndPos
);
#if defined(DEBUG_TIMING)
void
dumpTimingData( void ) const;
#endif
}
; /* AnnotatorTokenizer */
/* ----------------------------------------------------------------------- */
#endif /* UIMA_ANNOTATOR_TOK_H */
/* <EOF> */