| /** |
| |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| |
| ----------------------------------------------------------------------------- |
| |
| Description: A Unicode UIMA Tokenizer Annotator. |
| |
| -------------------------------------------------------------------------- */ |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Include dependencies */ |
| /* ----------------------------------------------------------------------- */ |
| |
| // this is included ONCE for the main source file of each binary |
| #include "uima/annotator_tok.hpp" |
| #include "uima/ss_tokenizer.hpp" // sentsep for Uima |
| #include "uima/tt_types.hpp" |
| |
| #include "uima/assertmsg.h" |
| #include "uima/macros.h" |
| #include "uima/trace.hpp" |
| #include "uima/comp_ids.h" /* for trace */ |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Globals */ |
| /* ----------------------------------------------------------------------- */ |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Config values */ |
| /* ----------------------------------------------------------------------- */ |
| |
// Constant table with all the options we access in the config file.
// The sequence of entries here must match the enum EnAnnotatorConfigOptions
// in the hpp file.
| const ConfigOptionInfo::StOptionInfo |
| AnnotatorTokenizer::cv_astConfigOptionInfo[] = { |
| { |
| "TokenNumbersIncludeStopwords", //cpszOptionName |
| ConfigOptionInfo::enValueType_Boolean, //enValueType |
| false, //bOptionIsMultiValued |
| 0, //uiNbrOfValuesRequired |
| "true", //cpszDefaultValueAsString |
| "If true token numbers are counted including stopwords" //cpszComment |
| }, |
| { |
| "UseRelativeTokenAndSentenceNumbers", //cpszOptionName |
| ConfigOptionInfo::enValueType_Boolean, //enValueType |
| false, //bOptionIsMultiValued |
| 0, //uiNbrOfValuesRequired |
| "false", //cpszDefaultValueAsString |
| "If true token and sentence numbers are reset to 1 for each new sentence/paragraph" //cpszComment |
| }, |
| { |
| "IgnorePunctuationTokens", //cpszOptionName |
| ConfigOptionInfo::enValueType_Boolean, //enValueType |
| false, //bOptionIsMultiValued |
| 0, //uiNbrOfValuesRequired |
| "false", //cpszDefaultValueAsString |
| "If true, punctuation tokens are ignored" //cpszComment |
| } |
| }; |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Implementation */ |
| /* ----------------------------------------------------------------------- */ |
| |
| /** Default Constructor. |
| */ |
| AnnotatorTokenizer::AnnotatorTokenizer(void) : |
| iv_uiParagraphStartIndex(0), |
| iv_uiSentenceStartIndex(0), |
| iv_uiTokenNbr(0), |
| iv_uiSentenceNbr(0), |
| iv_uiParagraphNbr(0), |
| iv_cuiCOUNTER_START(1), |
| iv_bTokenNumbersIncludeStopwords(true), |
| iv_bUseRelativeTokenAndSentenceNumbers(false), |
| iv_bIgnorePunctuationTokens(false), |
| iv_iTraceCompID(UIMA_TRACE_COMPID_ANNOTATOR_DEFAULT), |
| iv_pCASImpl(NULL), |
| iv_pFSHeap(NULL), |
| iv_tyTokenType(0), |
| iv_tyTokenTypeSize(0), |
| iv_tySentenceType(0), |
| iv_tySentenceTypeSize(0), |
| iv_tyParagraphType(0), |
| iv_tyParagraphTypeSize(0), |
| iv_tySofaFeatureOffset(0), |
| iv_tyBeginPositionFeatureOffset(0), |
| iv_tyEndPositionFeatureOffset(0), |
| iv_tyTokenNbrFeatureOffset(0), |
| iv_tySentenceNbrFeatureOffset(0), |
| iv_tyParagraphNbrFeatureOffset(0), |
| iv_stemFeature(0), |
| iv_bIsTokenReq(false), |
| iv_bIsSentenceReq(false), |
| iv_bIsParagraphReq(false), |
| iv_stemsRequired(false) |
| #ifdef DEBUG_TIMING |
| , |
| iv_clTotalTimer(), |
| iv_clSSTokTimer(), |
| iv_clUimaAnCreateTimer(), |
| iv_clUimaAnSetValTimer() |
| #endif |
| {} |
| |
| AnnotatorTokenizer::~AnnotatorTokenizer(void) { |
| util::Trace clTrace(util::enTraceDetailLow, UIMA_TRACE_ORIGIN, iv_iTraceCompID); |
| // set pointer to NULL to make clear this object did not own them |
| iv_pFSHeap = NULL; |
| } |
| |
| TyErrorId |
| AnnotatorTokenizer::getConfigValues(AnnotatorContext & rANC) { |
| // this must be done before any trace call |
| iv_iTraceCompID = rANC.getTraceCompId(); |
| |
| util::Trace clTrace(util::enTraceDetailLow, UIMA_TRACE_ORIGIN, iv_iTraceCompID); |
| |
| // make sure we have an entry in our config value table for each enum |
| assert(NUMBEROF(cv_astConfigOptionInfo) == enNumberOfConfigOptions); |
| |
| TyErrorId utErrID; |
| |
| // Get Option: token numbers are counted including stopwords? |
| utErrID = extractConfigOptionBoolean( |
| rANC, |
| cv_astConfigOptionInfo[enConfigOption_TokenNumbersIncludeStopwords], |
| iv_bTokenNumbersIncludeStopwords |
| ); |
| if (utErrID != UIMA_ERR_NONE) { |
| return utErrID; |
| } |
| |
| // Get Option: token and sentence number are reset to 1 for each new sentence/paragraph |
| utErrID = extractConfigOptionBoolean( |
| rANC, |
| cv_astConfigOptionInfo[enConfigOption_UseRelativeTokenAndSentenceNumbers], |
| iv_bUseRelativeTokenAndSentenceNumbers |
| ); |
| if (utErrID != UIMA_ERR_NONE) { |
| return utErrID; |
| } |
| |
| // Get Option: If true, punctuation tokens are ignored |
| utErrID = extractConfigOptionBoolean( |
| rANC, |
| cv_astConfigOptionInfo[enConfigOption_IgnorePunctuationTokens], |
| iv_bIgnorePunctuationTokens |
| ); |
| if (utErrID != UIMA_ERR_NONE) { |
| return utErrID; |
| } |
| |
| return UIMA_ERR_NONE; |
| } |
| |
| TyErrorId |
| AnnotatorTokenizer::initialize( |
| AnnotatorContext & rclAnnotatorContext |
| ) { |
| util::Trace clTrace(util::enTraceDetailLow, UIMA_TRACE_ORIGIN, iv_iTraceCompID); |
| return UIMA_ERR_NONE; |
| } |
| |
| TyErrorId AnnotatorTokenizer::typeSystemInit(TypeSystem const & typeSystem) { |
| |
| if (getConfigValues(getAnnotatorContext()) != UIMA_ERR_NONE) { |
| return UIMA_ERR_USER_ANNOTATOR_COULD_NOT_INIT; |
| } |
| |
| lowlevel::TypeSystem const & crTypeSystem = uima::lowlevel::TypeSystem::promoteTypeSystem(typeSystem); |
| |
| iv_tyTokenType = crTypeSystem.getTypeByName( TT::TYPE_NAME_TOKEN_ANNOTATION ); |
| iv_tySentenceType = crTypeSystem.getTypeByName( TT::TYPE_NAME_SENTENCE_ANNOTATION ); |
| iv_tyParagraphType = crTypeSystem.getTypeByName( TT::TYPE_NAME_PARAGRAPH_ANNOTATION ); |
| iv_tySentenceTypeSize = crTypeSystem.getFeatureNumber( iv_tySentenceType ); |
| iv_tyParagraphTypeSize = crTypeSystem.getFeatureNumber( iv_tyParagraphType ); |
| iv_tyTokenTypeSize = crTypeSystem.getFeatureNumber( iv_tyTokenType ); |
| iv_tySofaFeatureOffset = crTypeSystem.getFeatureOffset( uima::internal::gs_tySofaRefFeature ); |
| iv_tyBeginPositionFeatureOffset = crTypeSystem.getFeatureOffset( uima::internal::gs_tyBeginPosFeature ); |
| iv_tyEndPositionFeatureOffset = crTypeSystem.getFeatureOffset( uima::internal::gs_tyEndPosFeature ); |
| iv_tyTokenPropertiesFeatureOffset = crTypeSystem.getFeatureOffset( crTypeSystem.getFeatureByBaseName( iv_tyTokenType, TT::FEATURE_BASE_NAME_TOKEN_PROPERTIES ) ); |
| iv_tyTokenNbrFeatureOffset = crTypeSystem.getFeatureOffset( crTypeSystem.getFeatureByBaseName( iv_tyTokenType, TT::FEATURE_BASE_NAME_TOKEN_NUMBER ) ); |
| iv_tySentenceNbrFeatureOffset = crTypeSystem.getFeatureOffset( crTypeSystem.getFeatureByBaseName( iv_tySentenceType, TT::FEATURE_BASE_NAME_SENTENCE_NUMBER ) ); |
| iv_tyParagraphNbrFeatureOffset = crTypeSystem.getFeatureOffset( crTypeSystem.getFeatureByBaseName( iv_tyParagraphType, TT::FEATURE_BASE_NAME_PARAGRAPH_NUMBER ) ); |
| iv_stemFeature = crTypeSystem.getFeatureByBaseName( iv_tyTokenType, "stem"); |
| return (TyErrorId)UIMA_ERR_NONE; |
| } |
| |
| /** call the UIMA Annotator to deinitialize itself based on a UIMA engine |
| and return a UIMA error code */ |
| TyErrorId |
| AnnotatorTokenizer::destroy() { |
| util::Trace clTrace(util::enTraceDetailLow, UIMA_TRACE_ORIGIN, iv_iTraceCompID); |
| |
| #if defined(DEBUG_TIMING) |
| dumpTimingData(); |
| #endif |
| |
| resetCharClasses(); |
| |
| return (TyErrorId)UIMA_ERR_NONE; |
| } |
| |
#ifdef DEBUG_TIMING
/** Print the collected timers to cout.
 *
 *  The "Sentsep" line shows the tokenizer timer minus the two annotation
 *  timers; every line except the total is printed with its share of the
 *  total timer.
 */
void AnnotatorTokenizer::dumpTimingData( void ) const {
  const char * const cpszRule =
    "----------------------------------------------------------------";

  cout << cpszRule << endl;
  cout << " AnnotatorTokenizer: " << endl;
  cout << " total amount of time:"
       << iv_clTotalTimer.timeString()
       << endl;
  cout << " Sentsep : "
       << (iv_clSSTokTimer-iv_clUimaAnCreateTimer-iv_clUimaAnSetValTimer).timeAndPercentString(iv_clTotalTimer)
       << endl;
  cout << " Create ANs : "
       << iv_clUimaAnCreateTimer.timeAndPercentString(iv_clTotalTimer)
       << endl;
  cout << " Set AN Values : "
       << iv_clUimaAnSetValTimer.timeAndPercentString(iv_clTotalTimer)
       << endl;
  cout << cpszRule << endl;
}
#endif
| |
| /** call the UIMA Annotator to reconfigure itself based on a UIMA Configuration |
| section and return a UIMA error code */ |
| TyErrorId |
| AnnotatorTokenizer::reconfigure( |
| ) { |
| util::Trace clTrace(util::enTraceDetailLow, UIMA_TRACE_ORIGIN, iv_iTraceCompID); |
| resetCharClasses(); |
| if (getConfigValues(getAnnotatorContext()) != UIMA_ERR_NONE) { |
| return UIMA_ERR_USER_ANNOTATOR_COULD_NOT_INIT; |
| } |
| return (TyErrorId)UIMA_ERR_NONE; |
| } |
| |
| inline void |
| AnnotatorTokenizer::addNewTokenAnnotation( |
| TyDocIndex tyBeginPos, |
| TyDocIndex tyEndPos |
| ) { |
| // If we should filter punctuation chars (in TSE mode we have to do that!) |
| if ( iv_bIgnorePunctuationTokens |
| && (tyEndPos == tyBeginPos+1) // length == 1 |
| && iv_clTokenProperties.hasSpecial() |
| ) { |
| return; |
| } |
| |
| // Create new Annotation of type Token |
| lowlevel::TyFS tyNewToken = iv_pFSHeap->createFSWithSize( iv_tyTokenType, iv_tyTokenTypeSize ); |
| |
| // Set the TokenNumber Attribute for the new Token Annotation |
| iv_pFSHeap->setFeatureWithOffset( tyNewToken, iv_tySofaFeatureOffset, lowlevel::FSHeap::getAsFS((int) iv_pCASImpl->getSofaNum()) ); |
| iv_pFSHeap->setFeatureWithOffset( tyNewToken, iv_tyBeginPositionFeatureOffset, lowlevel::FSHeap::getAsFS((int) tyBeginPos) ); |
| iv_pFSHeap->setFeatureWithOffset( tyNewToken, iv_tyEndPositionFeatureOffset, lowlevel::FSHeap::getAsFS((int) tyEndPos) ); |
| iv_pFSHeap->setFeatureWithOffset( tyNewToken, iv_tyTokenNbrFeatureOffset, lowlevel::FSHeap::getAsFS( (int) iv_uiTokenNbr) ); |
| |
| unsigned long ulTokProp = iv_clTokenProperties.to_ulong(); |
| // **why this?** assert( sizeof(unsigned long) <= sizeof(lowlevel::TyFS) ); |
| iv_pFSHeap->setFeatureWithOffset( tyNewToken, iv_tyTokenPropertiesFeatureOffset, (lowlevel::TyFS) ulTokProp); |
| |
| // add to index |
| iv_pIndexRepository->add(tyNewToken); |
| ++iv_uiTokenNbr; |
| } |
| |
| inline void |
| AnnotatorTokenizer::addNewSentenceAnnotation( |
| TyDocIndex tyBeginPos, |
| TyDocIndex tyEndPos |
| ) { |
| if ( (!iv_bIsSentenceReq) |
| || (tyEndPos == tyBeginPos+1) // length == 1 |
| ) { |
| return; |
| } |
| |
| // Create new Annotation of type Sentence |
| lowlevel::TyFS tyNewSentence = iv_pFSHeap->createFSWithSize( iv_tySentenceType, iv_tySentenceTypeSize ); |
| |
| // Set the SentenceNumber Attribute for the new Sentence Annotation |
| iv_pFSHeap->setFeatureWithOffset( tyNewSentence, iv_tySofaFeatureOffset, lowlevel::FSHeap::getAsFS((int) iv_pCASImpl->getSofaNum()) ); |
| iv_pFSHeap->setFeatureWithOffset( tyNewSentence, iv_tyBeginPositionFeatureOffset, lowlevel::FSHeap::getAsFS((int)tyBeginPos) ); |
| iv_pFSHeap->setFeatureWithOffset( tyNewSentence, iv_tyEndPositionFeatureOffset, lowlevel::FSHeap::getAsFS((int)tyEndPos) ); |
| iv_pFSHeap->setFeatureWithOffset( tyNewSentence, iv_tySentenceNbrFeatureOffset, lowlevel::FSHeap::getAsFS( (int) iv_uiSentenceNbr) ); |
| |
| // add to index |
| iv_pIndexRepository->add(tyNewSentence); |
| } |
| |
| inline void |
| AnnotatorTokenizer::addNewParagraphAnnotation( |
| TyDocIndex tyBeginPos, |
| TyDocIndex tyEndPos |
| ) { |
| if ( (!iv_bIsParagraphReq) |
| || (tyEndPos == tyBeginPos+1) // length == 1 |
| ) { |
| return; |
| } |
| // Create new Annotation of type Paragrah |
| lowlevel::TyFS tyNewParagraph = iv_pFSHeap->createFSWithSize( iv_tyParagraphType, iv_tyParagraphTypeSize ); |
| |
| // Set the ParagraphNumber Attribute for the new Paragraph Annotation |
| iv_pFSHeap->setFeatureWithOffset( tyNewParagraph, iv_tySofaFeatureOffset, lowlevel::FSHeap::getAsFS((int) iv_pCASImpl->getSofaNum()) ); |
| iv_pFSHeap->setFeatureWithOffset( tyNewParagraph, iv_tyBeginPositionFeatureOffset, lowlevel::FSHeap::getAsFS((int)tyBeginPos) ); |
| iv_pFSHeap->setFeatureWithOffset( tyNewParagraph, iv_tyEndPositionFeatureOffset, lowlevel::FSHeap::getAsFS((int)tyEndPos) ); |
| iv_pFSHeap->setFeatureWithOffset( tyNewParagraph, iv_tyParagraphNbrFeatureOffset, lowlevel::FSHeap::getAsFS( (int) iv_uiParagraphNbr) ); |
| |
| // add to index |
| iv_pIndexRepository->add(tyNewParagraph); |
| } |
| |
| int AnnotatorTokenizer::tokenCallback( |
| unsigned long location, |
| unsigned long length, |
| TokenProperties & rclTokenProperties, |
| bool bNewPara, |
| bool bNewSent |
| ) { |
| // compute begin and end index of new token |
| TyDocIndex uiTokenStartIndex = (TyDocIndex) location; |
| TyDocIndex uiTokenEndIndex = uiTokenStartIndex + (TyDocIndex) (length); |
| iv_clTokenProperties = rclTokenProperties; |
| |
| if ( (bNewPara || bNewSent) && iv_bIsSentenceReq) { |
| |
| // new paragaph means no explicit new sentence - end current sentence anyway |
| addNewSentenceAnnotation(iv_uiSentenceStartIndex, uiTokenStartIndex ); |
| |
| // new sentence starts at beginning of current word |
| iv_uiSentenceStartIndex = uiTokenStartIndex; |
| ++iv_uiSentenceNbr; |
| |
| // reset token number |
| if (iv_bUseRelativeTokenAndSentenceNumbers) { |
| iv_uiTokenNbr = iv_cuiCOUNTER_START; |
| } |
| } |
| |
| // new paragraph started |
| if ( bNewPara && iv_bIsParagraphReq) { |
| |
| // new paragraph is from StartParagraphIndex to EndLastWordIndex |
| addNewParagraphAnnotation(iv_uiParagraphStartIndex, uiTokenStartIndex ); |
| |
| // new start of paragraph is at beginning of current word |
| iv_uiParagraphStartIndex = uiTokenStartIndex; |
| ++iv_uiParagraphNbr; |
| |
| // reset sentence number |
| if (iv_bUseRelativeTokenAndSentenceNumbers) { |
| iv_uiSentenceNbr = iv_cuiCOUNTER_START; |
| assert( iv_uiTokenNbr == iv_cuiCOUNTER_START ); |
| } |
| } |
| |
| // finally, mark new token |
| addNewTokenAnnotation(uiTokenStartIndex, uiTokenEndIndex ); |
| |
| return 0; |
| } |
| |
| /** call the UIMA Annotator to perform its duty based on a UIMA engine |
| and return a UIMA error code */ |
| TyErrorId |
| AnnotatorTokenizer::process( |
| CAS & tcas, |
| const ResultSpecification & crclTargetSet |
| ) { |
| util::Trace clTrace(util::enTraceDetailMedium, UIMA_TRACE_ORIGIN, iv_iTraceCompID); |
| |
| iv_pCASImpl = & uima::internal::CASImpl::promoteCAS(tcas); |
| iv_pFSHeap = & iv_pCASImpl->getHeap(); |
| iv_pIndexRepository = & iv_pCASImpl->getIndexRepository(); |
| |
| |
| UIMA_TIMING(iv_clTotalTimer.start()); |
| |
| #ifdef DEBUG_VERBOSE |
| UIMA_TPRINT( "ResultSpec" ); |
| crclTargetSet.print(cout); |
| #endif |
| |
| uima::lowlevel::TypeSystem const & lolTS = iv_pCASImpl->getHeap().getTypeSystem(); |
| Type tokType = uima::internal::FSPromoter::promoteType( iv_tyTokenType, lolTS); |
| Type sentType = uima::internal::FSPromoter::promoteType( iv_tySentenceType, lolTS); |
| Type parType = uima::internal::FSPromoter::promoteType( iv_tyParagraphType, lolTS); |
| Feature stemFeature = uima::internal::FSPromoter::promoteFeature( iv_stemFeature, lolTS ); |
| |
| // Check the target AT set: can contain token/sentence/paragraph |
| iv_bIsTokenReq = crclTargetSet.shouldBeCreatedByAnnotator( tokType ); |
| iv_bIsSentenceReq = crclTargetSet.shouldBeCreatedByAnnotator( sentType ); |
| iv_bIsParagraphReq= crclTargetSet.shouldBeCreatedByAnnotator( parType ); |
| iv_stemsRequired = crclTargetSet.shouldBeCreatedByAnnotator( stemFeature ); |
| |
| // if none of them is required, why are we beeing called? |
| assert(iv_bIsTokenReq || iv_bIsSentenceReq || iv_bIsParagraphReq); |
| |
| iv_uiTokenNbr = iv_cuiCOUNTER_START; |
| iv_uiSentenceNbr = iv_cuiCOUNTER_START; |
| iv_uiParagraphNbr = iv_cuiCOUNTER_START; |
| iv_uiParagraphStartIndex = 0; |
| iv_uiSentenceStartIndex = 0; |
| |
| // setLanguage(tcas.getDocumentAnnotation().getLanguage()); |
| |
| UnicodeStringRef ulStrDoc(tcas.getDocumentText()); |
| |
| if (ulStrDoc.length() == 0) { |
| return(TyErrorId)UIMA_ERR_NONE; |
| } |
| |
| UIMA_TIMING(iv_clSSTokTimer.start()); |
| Tokenizer::process( ulStrDoc.getBuffer(), ulStrDoc.getBuffer()+ulStrDoc.length()-1 ); |
| UIMA_TIMING(iv_clSSTokTimer.stop()); |
| |
| // |
| // terminate both sentence and paragraph up to the end of the document |
| // |
| if (iv_uiSentenceStartIndex < ulStrDoc.length()-1) { |
| addNewSentenceAnnotation(iv_uiSentenceStartIndex, ulStrDoc.length() ); |
| } |
| |
| if (iv_uiParagraphStartIndex < ulStrDoc.length()-1) { |
| addNewParagraphAnnotation(iv_uiParagraphStartIndex, ulStrDoc.length() ); |
| } |
| UIMA_TIMING(iv_clTotalTimer.stop()); |
| |
| return(TyErrorId)UIMA_ERR_NONE; |
| } |
| |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Mapping for generic C API wrapper */ |
| /* ----------------------------------------------------------------------- */ |
| |
| typedef AnnotatorTokenizer UserDefinedAnnotator; |
| // define for error/exception info in annotator_generic.inl |
| #define UIMA_ANNOTATOR_NAME "annotator_tok" |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Include generic C API wrapper */ |
| /* ----------------------------------------------------------------------- */ |
| |
| //#include "uima/annotator_generic.inl" |
| MAKE_AE(AnnotatorTokenizer); |
| /* <EOF> */ |
| |