blob: 553a3d3cd8c886d608e78f366dfa01050b169d4a [file] [log] [blame]
/** \file ss_tokenizer.hpp .
-----------------------------------------------------------------------------
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
-------------------------------------------------------------------------- */
#ifndef _INCLUDE_UIMASS
#define _INCLUDE_UIMASS
#include "uima/language.hpp"
#include "uima/token_properties.hpp"
namespace uima {
// Forward declaration only: Tokenizer merely stores a pointer to a ResourceABR
// (member iv_pclResourceABR), so the full definition is not needed in this header.
// NOTE(review): presumably the abbreviation resource consulted by
// Tokenizer::isAbreviation() -- confirm against the implementation file.
class ResourceABR;
// In this header MAXWARD sizes the first dimension of TyCharmap (MAXWARD+1 rows).
// NOTE(review): a "ward" presumably denotes a block of 256 code points selected
// by the high byte of a UChar -- confirm against the implementation file.
static const int MAXWARD = 6;
// Character map type: a (MAXWARD+1) x 256 table of unsigned short entries.
// NOTE(review): the entries are presumably EnCharClass bit values -- confirm.
typedef unsigned short TyCharmap [MAXWARD+1][256];
/** Character classes used in the tokenizer's character map.
    Apart from CH_INVALID (zero), every enumerator is a distinct power of two. */
typedef enum {
  CH_INVALID = 0,        // no valid class
  CH_LWR     = 1 << 0,   // lowercase character
  CH_UPR     = 1 << 1,   // uppercase character
  CH_NUM     = 1 << 2,   // number or currency symbol
  CH_USC     = 1 << 3,   // underscore: behaves like a character but carries
                         // no upper/lower case information
  CH_PRD     = 1 << 4,   // period (full stop)
  CH_SND     = 1 << 5,   // sentence end: '?' and '!'
  CH_BLK     = 1 << 6,   // blank
  CH_NWL     = 1 << 7,   // newline
  CH_SPC     = 1 << 8,   // special character (or whitespace)
  CH_CWS     = 1 << 9,   // conditional whitespace: between two alphanumeric
                         // characters it becomes part of the word
                         // (e.g. '/', '@', '-'); otherwise it is treated
                         // as whitespace
  CH_NSP     = 1 << 10,  // number separator ':' and ',': part of the number
                         // when it stands between digits
  CH_APS     = 1 << 11,  // apostrophe
  CH_NPA     = 1 << 12,  // new paragraph
  CH_CUR     = 1 << 13   // currency and degree symbol: part of the number
                         // when directly after or before a digit
} EnCharClass;
/** True when the character class value x is numerically below CH_PRD (16),
    i.e. x consists solely of the CH_LWR, CH_UPR, CH_NUM and CH_USC bits.
    Note that CH_INVALID (0) also satisfies the comparison, and that any value
    with a bit at or above CH_PRD set does not. */
#define CHAR_CLASS_IS_TOKEN_PART(x) ((x) < CH_PRD)
/** @name Tokenizer
The class <TT>Tokenizer</TT> is the implementation of a universal Unicode
tokenizer which is used in the UIMA tokenizer annotator.
The class is abstract: tokenCallback() is pure virtual and must be supplied
by a derived class.
NOTE(review): the class holds raw pointer members and declares neither a copy
constructor nor an assignment operator; if the destructor releases the
character map (not visible in this header), copying a derived tokenizer would
be unsafe -- confirm against the implementation file.
@see AnnotatorTokenizer
*/
class Tokenizer {
public:
/** Default Constructor.
*/
Tokenizer( void );
/// Virtual destructor, so a derived tokenizer can be destroyed through a Tokenizer pointer.
virtual ~Tokenizer();
/// Main tokenization function
/// @param cpszStart  pointer to the first UChar of the text to tokenize
/// @param cpszEnd    pointer marking the end of the text
///                   NOTE(review): presumably one past the last character -- confirm
void process( const UChar *cpszStart, const UChar *cpszEnd );
/// Specify language to use (needed for stopword recognition only)
void setLanguage( const Language & crclLanguage );
/// Callback function triggered on token recognition (pure virtual hook).
/// @param ulLocation           location of the token
///                             NOTE(review): presumably a UChar offset into the processed text -- confirm
/// @param ulLength             length of the token
/// @param crclTokenProperties  properties of the token (passed by non-const reference)
/// @param bNewPara             new-paragraph flag for this token
/// @param bNewSent             new-sentence flag for this token
/// @return int status; NOTE(review): its meaning is not visible in this header -- confirm
virtual int tokenCallback( unsigned long ulLocation,
unsigned long ulLength,
TokenProperties & crclTokenProperties,
bool bNewPara, bool bNewSent ) = 0;
/// Get the character class of the character c.
EnCharClass getCharClass(UChar c);
// change the character class for a code point
void setCharClass(WORD16 uiUnicodeCodePoint,
EnCharClass enCharClass);
// reset char class table to initial values
void resetCharClasses(void);
protected:
// NOTE(review): presumably the internal step that reports a recognized token
// and ends up invoking tokenCallback() -- confirm against the implementation.
// bNewPara, bNewSent and rulNewlines are non-const references, so the
// function is able to modify the caller's values.
int tokenEntry( const UChar *, size_t ulLocation,
size_t ulLength,
TokenProperties & crclTokenProperties,
bool &bNewPara, bool &bNewSent,
size_t & rulNewlines);
private:
// NOTE(review): presumably tests whether the uiLength characters starting at
// pw16String form a known abbreviation -- confirm against the implementation.
bool isAbreviation(const UChar * pw16String, size_t uiLength) const;
// get the character class of a character
// NOTE(review): the "Inl" suffix suggests an inline variant of getCharClass() -- confirm
EnCharClass getCharClassInl( UChar c );
// NOTE(review): purpose of this flag is not visible in this header -- confirm
bool iv_bUseAlternateTerritories;
// NOTE(review): presumably the language the abbreviation resource below was
// obtained for -- confirm
Language iv_clLanguageABR;
// pointer to the ResourceABR (only forward-declared in this header)
ResourceABR * iv_pclResourceABR;
// this will either point to our constant static map or
// to a freshly allocated writable map if setCharClass has been called
TyCharmap * iv_pauiCharMapWard;
};
} // namespace uima
#endif /* _INCLUDE_UIMASS */