// blob: bb9704f2e247008fcb8b4a8828d067c3aaa8bacb [file] [log] [blame]
////////////////////////////////////////////////////////////////////////////////
//
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////
package com.adobe.linguistics.spelling.utils
{
/**
 * Splits a text into word tokens for spell checking.
 *
 * A word is a maximal run of "valid" characters (see <code>allValidChars</code>);
 * every other character is a separator. Single-quote characters are accepted
 * inside a word (e.g. "don't") but are stripped from both ends of the run.
 *
 * Call <code>next()</code> repeatedly until it returns <code>null</code>.
 */
public class Tokenizer
{
	/* Code points of the quote characters that are stripped from word edges. */
	private static const APOSTROPHE:uint = 39;
	private static const LEFT_SINGLE_QUOTE:uint = 0x2018;
	private static const RIGHT_SINGLE_QUOTE:uint = 0x2019;

	/* Inclusive code point ranges of the characters that may be part of a word. */
	private static const allValidChars:Array = [
		{startingChar:65, endingChar:90}, /* Basic Latin upper case */
		{startingChar:97, endingChar:122}, /* Basic Latin lower case */
		{startingChar:39, endingChar:39}, /* "'" character */
		{startingChar:0x2018, endingChar:0x2019}, /* "‘" and "’" character */
		{startingChar:192, endingChar:214}, /* Latin-1 supplement bof */
		{startingChar:216, endingChar:246},
		{startingChar:248, endingChar:255}, /* Latin-1 supplement eof */
		{startingChar:256, endingChar:383}, /* Latin Extended-A bof-eof, European Latin */
		{startingChar:384, endingChar:447}, /* Latin Extended-B, first part only (U+0180-U+01BF) */
		{startingChar:48, endingChar:57}, /* number */
		{startingChar:536, endingChar:539} /* "Ș", "ș", "Ț", "ț" (comma below), for Romanian */
	];

	/* Text being tokenized; never null. */
	private var _data:String;
	/* Index of the next character of _data to examine. */
	private var _current:uint;

	/**
	 * Creates a tokenizer positioned at the start of <code>inText</code>.
	 *
	 * @param inText Text to split into words. <code>null</code> is treated
	 *               as an empty text, so <code>next()</code> returns null.
	 */
	public function Tokenizer(inText:String)
	{
		_data = (inText == null) ? "" : inText;
		_current = 0;
	}

	/**
	 * Returns the next word of the text and advances past it.
	 *
	 * @return A Token built from the index of the word's first character and
	 *         the index just past its last character, or <code>null</code>
	 *         when no word is left.
	 */
	public function next():Token
	{
		var len:uint = _data.length;
		var first:uint;
		var last:uint;

		while (_current < len) {
			// Skip the separators in front of the next run of word characters.
			while (_current < len && !isValidCharacter(_data.charCodeAt(_current))) {
				_current++;
			}
			if (_current >= len) return null;

			// Consume the run of word characters.
			first = _current;
			while (_current < len && isValidCharacter(_data.charCodeAt(_current))) {
				_current++;
			}
			last = _current;

			// Quotes are valid inside a word but not at its edges. Strip all of
			// them, and never let the two indices cross: a run made only of
			// quotes must not produce an empty or inverted token.
			while (first < last && isQuote(_data.charCodeAt(first))) first++;
			while (last > first && isQuote(_data.charCodeAt(last - 1))) last--;

			if (first < last) return new Token(first, last);
			// The run contained nothing but quotes; look for the next word.
		}
		return null;
	}

	/* Tells whether the code point is one of the single-quote characters. */
	private static function isQuote( inChar:Number ) :Boolean {
		return (inChar == APOSTROPHE) || (inChar == LEFT_SINGLE_QUOTE) || (inChar == RIGHT_SINGLE_QUOTE);
	}

	/* Tells whether the code point falls in one of the allValidChars ranges. */
	private static function isValidCharacter( inChar:int ) :Boolean {
		for each ( var range:Object in allValidChars ) {
			if ( (inChar >= range.startingChar) && (inChar <= range.endingChar) )
				return true;
		}
		return false;
	}

	/**
	 * Tells whether a character separates words.
	 *
	 * @param inChar String whose first character is examined.
	 * @return <code>true</code> when the first character cannot be part of a
	 *         word, or when <code>inChar</code> is null or empty.
	 */
	public static function isSeparator(inChar:String):Boolean
	{
		if (inChar == null || inChar.length == 0) return true;
		return !isValidCharacter( inChar.charCodeAt(0) );
	}
}
}