blob: be656eeec1a2377065427e18debd9963ba83602e [file] [log] [blame]
using J2N;
using System.Globalization;
namespace Lucene.Net.Analysis.Miscellaneous
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A BreakIterator-like API for iterating over subwords in text, according to <see cref="WordDelimiterFilter"/> rules.
/// @lucene.internal
/// </summary>
public sealed class WordDelimiterIterator
{
/// <summary>
/// Indicates the end of iteration </summary>
public const int DONE = -1;
public static readonly byte[] DEFAULT_WORD_DELIM_TABLE = LoadDefaultWordDelimTable();
internal char[] text;
private int length;
/// <summary>
/// start position of text, excluding leading delimiters </summary>
private int startBounds;
/// <summary>
/// end position of text, excluding trailing delimiters </summary>
private int endBounds;
/// <summary>
/// Beginning of subword </summary>
internal int current;
/// <summary>
/// End of subword </summary>
internal int end;
/// <summary>does this string end with a possessive such as 's</summary>
private bool hasFinalPossessive = false;
/// <summary>
/// If false, causes case changes to be ignored (subwords will only be generated
/// given SUBWORD_DELIM tokens). (Defaults to true)
/// </summary>
private readonly bool splitOnCaseChange;
/// <summary>
/// If false, causes numeric changes to be ignored (subwords will only be generated
/// given SUBWORD_DELIM tokens). (Defaults to true)
/// </summary>
private readonly bool splitOnNumerics;
/// <summary>
/// If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
/// <p/>
/// "O'Neil's" => "O", "Neil"
/// </summary>
private readonly bool stemEnglishPossessive;
private readonly byte[] charTypeTable;
/// <summary>
/// if true, need to skip over a possessive found in the last call to next() </summary>
private bool skipPossessive = false;
// TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
// done if separated by these chars?) "," would be an obvious candidate...
private static byte[] LoadDefaultWordDelimTable() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
{
var tab = new byte[256];
for (int i = 0; i < 256; i++)
{
byte code = 0;
if (Character.IsLower(i))
{
code |= WordDelimiterFilter.LOWER;
}
else if (Character.IsUpper(i))
{
code |= WordDelimiterFilter.UPPER;
}
else if (Character.IsDigit(i))
{
code |= WordDelimiterFilter.DIGIT;
}
if (code == 0)
{
code = WordDelimiterFilter.SUBWORD_DELIM;
}
tab[i] = code;
}
return tab;
}
/// <summary>
/// Create a new <see cref="WordDelimiterIterator"/> operating with the supplied rules.
/// </summary>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="splitOnCaseChange"> if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) </param>
/// <param name="splitOnNumerics"> if true, causes "j2se" to be three tokens; "j" "2" "se" </param>
/// <param name="stemEnglishPossessive"> if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" </param>
internal WordDelimiterIterator(byte[] charTypeTable, bool splitOnCaseChange, bool splitOnNumerics, bool stemEnglishPossessive)
{
this.charTypeTable = charTypeTable;
this.splitOnCaseChange = splitOnCaseChange;
this.splitOnNumerics = splitOnNumerics;
this.stemEnglishPossessive = stemEnglishPossessive;
}
/// <summary>
/// Advance to the next subword in the string.
/// </summary>
/// <returns> index of the next subword, or <see cref="DONE"/> if all subwords have been returned </returns>
internal int Next()
{
current = end;
if (current == DONE)
{
return DONE;
}
if (skipPossessive)
{
current += 2;
skipPossessive = false;
}
int lastType = 0;
while (current < endBounds && (WordDelimiterFilter.IsSubwordDelim(lastType = CharType(text[current]))))
{
current++;
}
if (current >= endBounds)
{
return end = DONE;
}
for (end = current + 1; end < endBounds; end++)
{
int type = CharType(text[end]);
if (IsBreak(lastType, type))
{
break;
}
lastType = type;
}
if (end < endBounds - 1 && EndsWithPossessive(end + 2))
{
skipPossessive = true;
}
return end;
}
/// <summary>
/// Return the type of the current subword.
/// This currently uses the type of the first character in the subword.
/// </summary>
/// <returns> type of the current word </returns>
internal int Type
{
get
{
if (end == DONE)
{
return 0;
}
int type = CharType(text[current]);
switch (type)
{
// return ALPHA word type for both lower and upper
case WordDelimiterFilter.LOWER:
case WordDelimiterFilter.UPPER:
return WordDelimiterFilter.ALPHA;
default:
return type;
}
}
}
/// <summary>
/// Reset the text to a new value, and reset all state
/// </summary>
/// <param name="text"> New text </param>
/// <param name="length"> length of the text </param>
internal void SetText(char[] text, int length)
{
this.text = text;
this.length = this.endBounds = length;
current = startBounds = end = 0;
skipPossessive = hasFinalPossessive = false;
SetBounds();
}
// ================================================= Helper Methods ================================================
/// <summary>
/// Determines whether the transition from lastType to type indicates a break
/// </summary>
/// <param name="lastType"> Last subword type </param>
/// <param name="type"> Current subword type </param>
/// <returns> <c>true</c> if the transition indicates a break, <c>false</c> otherwise </returns>
private bool IsBreak(int lastType, int type)
{
if ((type & lastType) != 0)
{
return false;
}
if (!splitOnCaseChange && WordDelimiterFilter.IsAlpha(lastType) && WordDelimiterFilter.IsAlpha(type))
{
// ALPHA->ALPHA: always ignore if case isn't considered.
return false;
}
else if (WordDelimiterFilter.IsUpper(lastType) && WordDelimiterFilter.IsAlpha(type))
{
// UPPER->letter: Don't split
return false;
}
else if (!splitOnNumerics && ((WordDelimiterFilter.IsAlpha(lastType) && WordDelimiterFilter.IsDigit(type)) || (WordDelimiterFilter.IsDigit(lastType) && WordDelimiterFilter.IsAlpha(type))))
{
// ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
return false;
}
return true;
}
/// <summary>
/// Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters
/// </summary>
/// <returns> <c>true</c> if the current word contains only one subword, <c>false</c> otherwise </returns>
internal bool IsSingleWord()
{
if (hasFinalPossessive)
{
return current == startBounds && end == endBounds - 2;
}
else
{
return current == startBounds && end == endBounds;
}
}
/// <summary>
/// Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
/// it yet, simply note it.
/// </summary>
private void SetBounds()
{
while (startBounds < length && (WordDelimiterFilter.IsSubwordDelim(CharType(text[startBounds]))))
{
startBounds++;
}
while (endBounds > startBounds && (WordDelimiterFilter.IsSubwordDelim(CharType(text[endBounds - 1]))))
{
endBounds--;
}
if (EndsWithPossessive(endBounds))
{
hasFinalPossessive = true;
}
current = startBounds;
}
/// <summary>
/// Determines if the text at the given position indicates an English possessive which should be removed
/// </summary>
/// <param name="pos"> Position in the text to check if it indicates an English possessive </param>
/// <returns> <c>true</c> if the text at the position indicates an English posessive, <c>false</c> otherwise </returns>
private bool EndsWithPossessive(int pos)
{
return (stemEnglishPossessive &&
pos > 2 &&
text[pos - 2] == '\'' &&
(text[pos - 1] == 's' || text[pos - 1] == 'S') &&
WordDelimiterFilter.IsAlpha(CharType(text[pos - 3])) &&
(pos == endBounds || WordDelimiterFilter.IsSubwordDelim(CharType(text[pos]))));
}
/// <summary>
/// Determines the type of the given character
/// </summary>
/// <param name="ch"> Character whose type is to be determined </param>
/// <returns> Type of the character </returns>
private int CharType(int ch)
{
if (ch < charTypeTable.Length)
{
return charTypeTable[ch];
}
return GetType(ch);
}
/// <summary>
/// Computes the type of the given character
/// </summary>
/// <param name="ch"> Character whose type is to be determined </param>
/// <returns> Type of the character </returns>
public static byte GetType(int ch)
{
switch (Character.GetType(ch))
{
case UnicodeCategory.UppercaseLetter:
return WordDelimiterFilter.UPPER;
case UnicodeCategory.LowercaseLetter:
return WordDelimiterFilter.LOWER;
case UnicodeCategory.TitlecaseLetter:
case UnicodeCategory.ModifierLetter:
case UnicodeCategory.OtherLetter:
case UnicodeCategory.NonSpacingMark:
case UnicodeCategory.EnclosingMark: // depends what it encloses?
case UnicodeCategory.SpacingCombiningMark:
return WordDelimiterFilter.ALPHA;
case UnicodeCategory.DecimalDigitNumber:
case UnicodeCategory.LetterNumber:
case UnicodeCategory.OtherNumber:
return WordDelimiterFilter.DIGIT;
// case Character.SPACE_SEPARATOR:
// case Character.LINE_SEPARATOR:
// case Character.PARAGRAPH_SEPARATOR:
// case Character.CONTROL:
// case Character.FORMAT:
// case Character.PRIVATE_USE:
case UnicodeCategory.Surrogate:
return WordDelimiterFilter.ALPHA | WordDelimiterFilter.DIGIT;
// case Character.DASH_PUNCTUATION:
// case Character.START_PUNCTUATION:
// case Character.END_PUNCTUATION:
// case Character.CONNECTOR_PUNCTUATION:
// case Character.OTHER_PUNCTUATION:
// case Character.MATH_SYMBOL:
// case Character.CURRENCY_SYMBOL:
// case Character.MODIFIER_SYMBOL:
// case Character.OTHER_SYMBOL:
// case Character.INITIAL_QUOTE_PUNCTUATION:
// case Character.FINAL_QUOTE_PUNCTUATION:
default:
return WordDelimiterFilter.SUBWORD_DELIM;
}
}
}
}