//////////////////////////////////////////////////////////////////////////////// | |
// | |
// Licensed to the Apache Software Foundation (ASF) under one or more | |
// contributor license agreements. See the NOTICE file distributed with | |
// this work for additional information regarding copyright ownership. | |
// The ASF licenses this file to You under the Apache License, Version 2.0 | |
// (the "License"); you may not use this file except in compliance with | |
// the License. You may obtain a copy of the License at | |
// | |
// http://www.apache.org/licenses/LICENSE-2.0 | |
// | |
// Unless required by applicable law or agreed to in writing, software | |
// distributed under the License is distributed on an "AS IS" BASIS, | |
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
// See the License for the specific language governing permissions and | |
// limitations under the License. | |
// | |
//////////////////////////////////////////////////////////////////////////////// | |
/* | |
* ToDo: Create ASDoc style comment to generate the API document. | |
*/ | |
package com.adobe.linguistics.utils | |
{ | |
import flash.utils.Dictionary; | |
import flash.text.engine.TextBlock; | |
import flash.text.engine.TextElement; | |
import flash.text.engine.ElementFormat; | |
/** | |
* <p>The <code>TextTokenizer</code> class locates the boundaries of words in a | |
* block of text.</p> | |
* | |
* Word boundary locations are found according to these general principles: | |
* <ul> | |
* <li> Be able to tokenize a block of text specified by start and end positions </li> | |
* <li> Default separator is Unicode white space character. Also break on newlines </li> | |
* <li> Tokens consist of either words or numbers in which case it may include commas, etc.. </li> | |
* <li> Apostrophes or hyphens within a word are kept with the word </li> | |
* <li> Punctuation, spaces and other characters that are not part of a token, are broken out separately </li> | |
* </ul> | |
* <p>In the future versions, this class would also provide a way for the developers to customize the separators used by the tokenizer. </p> | |
* | |
* @playerversion Flash 9.x | |
* @langversion 3.0 | |
*/ | |
public class TextTokenizer implements ITokenizer | |
{ | |
private var _textBlock:TextBlock; | |
private var _textHolder:String; | |
private var _startIndex:int; | |
private var _endIndex:int; | |
private var _firstToken:Token; | |
private var _lastToken:Token; | |
private var _ignoredCharactersDict:Dictionary = new Dictionary(); | |
/** | |
* The tokenizer for a String object. | |
* This class implements the ITokenizer interface. | |
* Constructs a new TextTokenizer object to break String to words by creating with a new piece of text. | |
* @param textHolder A <code>String</code> object to hold the text which will be processed by this tokenizer. | |
* @param startIndex A <code>int</code> type input to hold the starting index of input text should be scanned. | |
* @param endIndex A <code>int</code> type input to hold the ending index of input text should be scanned. | |
* <span class="hide"> TODO param requestedLocaleIDName The LocaleID name to be used by this TextTokenizer object. </span> | |
* @playerversion Flash 10 | |
* @langversion 3.0 | |
*/ | |
public function TextTokenizer(textHolder:String, startIndex:int=0, endIndex:int=int.MAX_VALUE)//, requestedLocaleIDName:String=null) | |
{ | |
//requestedLocaleIDName parameter is useful for potential extension. won't handle it in the first round of implementation. | |
// same comments for API: requestedLocaleIDName()/actualLocaleIDName()/getAvailableLocaleIDNames() | |
var textElement:TextElement = new TextElement(textHolder, new ElementFormat()); | |
var textBlock:TextBlock = new TextBlock(); | |
textBlock.content = textElement; | |
/* init a tokenizer object */ | |
this._textBlock = textBlock; | |
this._textHolder = textHolder; | |
this._startIndex = 0; | |
this._endIndex = this._textBlock.content.text.length; | |
initDefaultIgnoredCharacters(); | |
setStartIndex(startIndex); | |
setEndIndex(endIndex); | |
} | |
private function setStartIndex(value:int):void { | |
if ( value <= 0 ) | |
this._startIndex = 0; | |
else if ( value >= this._endIndex ) | |
this._startIndex = this._endIndex; | |
else | |
this._startIndex=value; | |
} | |
// strange behaviour with String.substring() function... need more thinking.... | |
private function setEndIndex(value:int):void { | |
if ( value >= this._textBlock.content.text.length ) | |
this._endIndex = this._textBlock.content.text.length; | |
else if ( value <= this._startIndex ) | |
this._endIndex = this._startIndex; | |
else | |
this._endIndex = value; | |
} | |
private function initDefaultIgnoredCharacters():void { | |
var ignoredCharsArray:Array = [ | |
0x002d, | |
0x2010, | |
0x2011, | |
0x0003, | |
0x0007 | |
]; | |
var ignoredChars:String = ""; | |
for ( var i:int=0; i< ignoredCharsArray.length; ++i ) { | |
ignoredChars=ignoredChars+String.fromCharCode(ignoredCharsArray[i]); | |
} | |
this.ignoredCharacters = ignoredChars; | |
} | |
private function getNextTokenByIndex( startPos:int ):Token{ | |
var resultToken:Token = null; | |
/* calculate first token and return it. */ | |
var i:int = (startPos > this._startIndex) ? startPos: this._startIndex; | |
while ( i< this._endIndex ) { | |
var begin:int = i; | |
i = this._textBlock.findNextWordBoundary(begin); | |
var end:int = ( i <= this._endIndex) ? i : this._endIndex; | |
//trace(this._textHolder.substring(begin,end)); | |
if ( !isSingleSpecialCharacter( this._textHolder.substring(begin,end) ) ) { | |
resultToken = new Token(begin,end); | |
break; | |
} | |
} | |
if ( resultToken==null ) resultToken = this.getLastToken(); | |
return resultToken; | |
} | |
private function getPreviousTokenByIndex( endPos:int):Token { | |
var resultToken:Token = null; | |
/* calculate first token and return it. */ | |
var i:int = (endPos < this._endIndex) ? endPos: this._endIndex; | |
/* special handling for last element in the word, bof */ | |
var specialHandling:Boolean = false; | |
if ( i == this._endIndex ) { | |
specialHandling = true; | |
i = this._endIndex -1; | |
} | |
/* special handling for last element in the word, eof */ | |
while ( i > this._startIndex ) { | |
var end:int = i; | |
i = this._textBlock.findPreviousWordBoundary(end); | |
var begin:int = ( i > this._startIndex) ? i : this._startIndex; | |
/* special handling for last element in the word, bof */ | |
if ( specialHandling ) { | |
end = (this._textBlock.findNextWordBoundary(begin)<this._endIndex) ?this._textBlock.findNextWordBoundary(begin):this._endIndex; | |
specialHandling=false; | |
if ( (end != this._endIndex) && !isSingleSpecialCharacter(this._textHolder.substring(this._endIndex-1,this._endIndex)) ) { | |
begin = this._endIndex-1; | |
i=begin; | |
end = this._endIndex; | |
} | |
} | |
/* special handling for last element in the word, eof */ | |
if ( !isSingleSpecialCharacter( this._textHolder.substring(begin,end) ) ) { | |
resultToken = new Token(begin,end); | |
break; | |
} | |
} | |
if ( resultToken==null ) resultToken = this.getFirstToken(); | |
return resultToken; | |
} | |
private function isExceptionCharacter(word:String):Boolean { | |
if ( word.length != 1 ) return false; | |
if ( this._ignoredCharactersDict[word] == true ) return true; | |
return false; | |
} | |
private function getNextFilteredTokenByIndex(startPos:int):Token { | |
var token:Token = getNextTokenByIndex(startPos); | |
var firstToken:Token = token; | |
var cursor:int=token.last+1; | |
while ( (cursor < this._endIndex) ) { | |
if ( !isExceptionCharacter(this._textHolder.substring(cursor-1,cursor)) ) { | |
break; | |
}else { | |
//another request from Harish about handling case abc\\abc abc\.abc case...not 100% sure about the correct behavior... | |
/*bof*/ | |
while( cursor < this._endIndex && isExceptionCharacter(this._textHolder.substring(cursor-1,cursor)) ) { | |
cursor++; | |
} | |
cursor--; | |
/*eof*/ | |
} | |
token = getNextTokenByIndex(cursor); | |
if ( token.first != cursor ) { | |
token = firstToken; | |
break; | |
} | |
token.first=firstToken.first; | |
firstToken = token; | |
cursor = token.last+1; | |
} | |
return token; | |
} | |
private function getPreviousFilteredTokenByIndex(endPos:int):Token { | |
var token:Token = getPreviousTokenByIndex(endPos); | |
var lastToken:Token = token; | |
var cursor:int=token.first-1; | |
while ( ( cursor > this._startIndex ) ) { | |
if ( !isExceptionCharacter(this._textHolder.substring(cursor,cursor+1)) ) { | |
break; | |
}else { | |
//another request from Harish about handling case abc\\abc abc\.abc case...not 100% sure about the correct behavior... | |
/*bof*/ | |
while( cursor > this._startIndex && isExceptionCharacter(this._textHolder.substring(cursor,cursor+1)) ) { | |
cursor--; | |
} | |
cursor++; | |
/*eof*/ | |
} | |
token = getPreviousTokenByIndex(cursor); | |
if ( token.last != cursor ) { | |
token = lastToken; | |
break; | |
} | |
token.last=lastToken.last; | |
lastToken = token; | |
cursor = token.first-1; | |
} | |
return token; | |
} | |
private function isSingleSpecialCharacter(word:String):Boolean{ | |
if ( word.length != 1 ) return false; | |
if ( word.toLocaleLowerCase() == word.toLocaleUpperCase() ) return true; | |
return false; | |
} | |
/** | |
* Set all of ignored separators to this tokenizer class. | |
* | |
* A vector of int containing all of ignored separators code point which are used by this class. | |
* @playerversion Flash 10 | |
* @langversion 3.0 | |
*/ | |
public function set ignoredSeparators(characters:Vector.<int>):void { | |
if ( characters == null || characters.length==0 ) return; | |
this._ignoredCharactersDict = new Dictionary(); | |
for ( var i:int =0;i<characters.length;++i) { | |
this._ignoredCharactersDict[String.fromCharCode(characters[i])]=true; | |
} | |
} | |
/** | |
* Get all of ignored separators used by this tokenizer class. | |
* | |
* A vector of int containing all of ignored separators code point which are used by this class. | |
* @playerversion Flash 10 | |
* @langversion 3.0 | |
*/ | |
public function get ignoredSeparators():Vector.<int>{ | |
var result:Vector.<int> = new Vector.<int>(); | |
for ( var key:String in _ignoredCharactersDict) { | |
result.push(key.charCodeAt(0) ); | |
} | |
return result; | |
} | |
private function set ignoredCharacters(value:String ) :void { | |
if( value == null || value == "" ) return; | |
var charArr:Array = value.split(""); | |
this._ignoredCharactersDict = new Dictionary(); | |
for ( var i:int = 0;i< charArr.length;++i) { | |
this._ignoredCharactersDict[charArr[i]]=true; | |
} | |
} | |
private function get ignoredCharacters():String { | |
var result:String = ""; | |
for ( var key:String in _ignoredCharactersDict) { | |
result +=key; | |
} | |
return result; | |
} | |
/** | |
* The name of the requested locale ID that was passed to the constructor of this TextTokenizer object. | |
* | |
* @playerversion Flash 10 | |
* @langversion 3.0 | |
*/ /* | |
public function get requestedLocaleIDName():String { | |
return null; | |
} | |
/** | |
* The name of the actual locale ID used by this TextTokenizer object. | |
* | |
* @playerversion Flash 10 | |
* @langversion 3.0 | |
*/ /* | |
public function get actualLocaleIDName():String { | |
return null; | |
} | |
/** | |
* Lists all of the locale ID names supported by this class. | |
* | |
* A vector of strings containing all of the locale ID names supported by this class. | |
* @playerversion Flash 10 | |
* @langversion 3.0 | |
*/ /* | |
public static function getAvailableLocaleIDNames():Vector.<String>{ return null;} | |
*/ | |
/** | |
* Return the first word in the text being scanned. | |
* <p> NOTE: In a special case when there are no valid tokens in text, it returns a pseudo token having first and last index set to int.MAX_VALUE. As a result<code> firstToken().first </code>equals int.MAX_VALUE and<code> firstToken().last </code>equals int.MAX_VALUE.</p> | |
* | |
* @playerversion Flash 10 | |
* @langversion 3.0 | |
*/ | |
public function getFirstToken():Token { | |
/* return the cached one. */ | |
if ( this._firstToken != null ) | |
return this._firstToken; | |
/* calculate first token and return it. */ | |
//this._firstToken = getNextTokenByIndex(this._startIndex); // without any filter from LS, directly use FTE tokenizer... | |
this._firstToken = getNextFilteredTokenByIndex(this._startIndex); | |
return this._firstToken; | |
} | |
/** | |
* @private | |
* Return the last word in the text being scanned. | |
* | |
* @playerversion Flash 10 | |
* @langversion 3.0 | |
*/ | |
public function getLastToken():Token { | |
/* return the cached one. */ | |
if ( this._lastToken != null ) | |
return this._lastToken; | |
/* calculate last token and return it. */ | |
this._lastToken = new Token(int.MAX_VALUE,int.MAX_VALUE); | |
return this._lastToken; | |
} | |
/** | |
* Determine the next word following the current token. | |
* | |
* <p>Returns the token of the next word.</p><p> NOTE: When there are no more valid tokens, it returns a pseudo token having first and last index set to int.MAX_VALUE. As a result<code> getNextToken().first </code>equals int.MAX_VALUE and<code> getNextToken().last </code>equals int.MAX_VALUE.</p> | |
* @param token A <code>Token</code> object to be used for determining next word. | |
* @playerversion Flash 10 | |
* @langversion 3.0 | |
*/ | |
public function getNextToken(token:Token):Token { | |
//return getNextTokenByIndex(token.last); // without any filter from LS, directly use FTE tokenizer... | |
return getNextFilteredTokenByIndex(token.last); | |
} | |
/** | |
* Determine the word preceding the current token. | |
* | |
* <p>Returns the token of the previous word or<code> getFirstToken </code>object if there is no preceding word.</p> | |
* @param token A <code>Token</code> object to be used for determining previous word. | |
* @playerversion Flash 10 | |
* @langversion 3.0 | |
*/ | |
public function getPreviousToken(token:Token):Token { | |
//return getPreviousTokenByIndex( token.first );// without any filter from LS, directly use FTE tokenizer... | |
return getPreviousFilteredTokenByIndex( token.first ) | |
} | |
} | |
} |