blob: 93883168aacc0d6413a7bb52738f105dc4e75929 [file] [log] [blame]
using Lucene.Net.Analysis.Util;
using Lucene.Net.Diagnostics;
using Lucene.Net.Store;
using Lucene.Net.Util;
using Lucene.Net.Util.Automaton;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace Lucene.Net.Analysis.Hunspell
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Stemmer uses the affix rules declared in the <see cref="Dictionary"/> to generate one or more stems for a word. It
/// conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
/// </summary>
internal sealed class Stemmer
{
private readonly Dictionary dictionary;
private readonly BytesRef scratch = new BytesRef();
private readonly StringBuilder segment = new StringBuilder();
private readonly ByteArrayDataInput affixReader;
// used for normalization
private readonly StringBuilder scratchSegment = new StringBuilder();
private char[] scratchBuffer = new char[32];
/// <summary>
/// Constructs a new Stemmer which will use the provided <see cref="Dictionary"/> to create its stems.
/// </summary>
/// <param name="dictionary"> <see cref="Dictionary"/> that will be used to create the stems </param>
public Stemmer(Dictionary dictionary)
{
this.dictionary = dictionary;
this.affixReader = new ByteArrayDataInput(dictionary.affixData);
}
/// <summary>
/// Find the stem(s) of the provided word.
/// </summary>
/// <param name="word"> Word to find the stems for </param>
/// <returns> <see cref="IList{CharsRef}"/> of stems for the word </returns>
public IList<CharsRef> Stem(string word)
{
return Stem(word.ToCharArray(), word.Length);
}
/// <summary>
/// Find the stem(s) of the provided word
/// </summary>
/// <param name="word"> Word to find the stems for </param>
/// <param name="length"> length </param>
/// <returns> <see cref="IList{CharsRef}"/> of stems for the word </returns>
public IList<CharsRef> Stem(char[] word, int length)
{
if (dictionary.needsInputCleaning)
{
scratchSegment.Length = 0;
scratchSegment.Append(word, 0, length);
string cleaned = dictionary.CleanInput(scratchSegment.ToString(), segment);
scratchBuffer = ArrayUtil.Grow(scratchBuffer, cleaned.Length);
length = segment.Length;
segment.CopyTo(0, scratchBuffer, 0, length);
word = scratchBuffer;
}
List<CharsRef> stems = new List<CharsRef>();
Int32sRef forms = dictionary.LookupWord(word, 0, length);
if (forms != null)
{
// TODO: some forms should not be added, e.g. ONLYINCOMPOUND
// just because it exists, does not make it valid...
for (int i = 0; i < forms.Length; i++)
{
stems.Add(NewStem(word, length));
}
}
stems.AddRange(Stem(word, length, -1, -1, -1, 0, true, true, false, false));
return stems;
}
/// <summary>
/// Find the unique stem(s) of the provided word
/// </summary>
/// <param name="word"> Word to find the stems for </param>
/// <param name="length"> length </param>
/// <returns> <see cref="IList{CharsRef}"/> of stems for the word </returns>
public IList<CharsRef> UniqueStems(char[] word, int length)
{
IList<CharsRef> stems = Stem(word, length);
if (stems.Count < 2)
{
return stems;
}
CharArraySet terms = new CharArraySet(
#pragma warning disable 612, 618
LuceneVersion.LUCENE_CURRENT, 8, dictionary.ignoreCase);
#pragma warning restore 612, 618
IList<CharsRef> deduped = new List<CharsRef>();
foreach (CharsRef s in stems)
{
if (!terms.Contains(s))
{
deduped.Add(s);
terms.Add(s);
}
}
return deduped;
}
private CharsRef NewStem(char[] buffer, int length)
{
if (dictionary.needsOutputCleaning)
{
scratchSegment.Length = 0;
scratchSegment.Append(buffer, 0, length);
try
{
Dictionary.ApplyMappings(dictionary.oconv, scratchSegment);
}
catch (IOException bogus)
{
throw new Exception(bogus.Message, bogus);
}
char[] cleaned = new char[scratchSegment.Length];
scratchSegment.CopyTo(0, cleaned, 0, cleaned.Length);
return new CharsRef(cleaned, 0, cleaned.Length);
}
else
{
return new CharsRef(buffer, 0, length);
}
}
// ================================================= Helper Methods ================================================
/// <summary>
/// Generates a list of stems for the provided word
/// </summary>
/// <param name="word"> Word to generate the stems for </param>
/// <param name="length"> length </param>
/// <param name="previous"> previous affix that was removed (so we dont remove same one twice) </param>
/// <param name="prevFlag"> Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step </param>
/// <param name="prefixFlag"> flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word </param>
/// <param name="recursionDepth"> current recursiondepth </param>
/// <param name="doPrefix"> true if we should remove prefixes </param>
/// <param name="doSuffix"> true if we should remove suffixes </param>
/// <param name="previousWasPrefix"> true if the previous removal was a prefix:
/// if we are removing a suffix, and it has no continuation requirements, its ok.
/// but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
/// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix
/// this means inner most suffix must also contain circumfix flag. </param>
/// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns>
private IList<CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix)
{
// TODO: allow this stuff to be reused by tokenfilter
List<CharsRef> stems = new List<CharsRef>();
if (doPrefix && dictionary.prefixes != null)
{
for (int i = length - 1; i >= 0; i--)
{
Int32sRef prefixes = dictionary.LookupPrefix(word, 0, i);
if (prefixes == null)
{
continue;
}
for (int j = 0; j < prefixes.Length; j++)
{
int prefix = prefixes.Int32s[prefixes.Offset + j];
if (prefix == previous)
{
continue;
}
affixReader.Position = 8 * prefix;
char flag = (char)(affixReader.ReadInt16() & 0xffff);
char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
int condition = (char)(affixReader.ReadInt16() & 0xffff);
bool crossProduct = (condition & 1) == 1;
condition = (int)((uint)condition >> 1);
char append = (char)(affixReader.ReadInt16() & 0xffff);
bool compatible;
if (recursionDepth == 0)
{
compatible = true;
}
else if (crossProduct)
{
// cross check incoming continuation class (flag of previous affix) against list.
dictionary.flagLookup.Get(append, scratch);
char[] appendFlags = Dictionary.DecodeFlags(scratch);
if (Debugging.AssertsEnabled) Debugging.Assert(prevFlag >= 0);
compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, false);
}
else
{
compatible = false;
}
if (compatible)
{
int deAffixedStart = i;
int deAffixedLength = length - deAffixedStart;
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
int stripLength = stripEnd - stripStart;
if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength))
{
continue;
}
char[] strippedWord = new char[stripLength + deAffixedLength];
Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix);
stems.AddRange(stemList);
}
}
}
}
if (doSuffix && dictionary.suffixes != null)
{
for (int i = 0; i < length; i++)
{
Int32sRef suffixes = dictionary.LookupSuffix(word, i, length - i);
if (suffixes == null)
{
continue;
}
for (int j = 0; j < suffixes.Length; j++)
{
int suffix = suffixes.Int32s[suffixes.Offset + j];
if (suffix == previous)
{
continue;
}
affixReader.Position = 8 * suffix;
char flag = (char)(affixReader.ReadInt16() & 0xffff);
char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
int condition = (char)(affixReader.ReadInt16() & 0xffff);
bool crossProduct = (condition & 1) == 1;
condition = (int)((uint)condition >> 1);
char append = (char)(affixReader.ReadInt16() & 0xffff);
bool compatible;
if (recursionDepth == 0)
{
compatible = true;
}
else if (crossProduct)
{
// cross check incoming continuation class (flag of previous affix) against list.
dictionary.flagLookup.Get(append, scratch);
char[] appendFlags = Dictionary.DecodeFlags(scratch);
if (Debugging.AssertsEnabled) Debugging.Assert(prevFlag >= 0);
compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
}
else
{
compatible = false;
}
if (compatible)
{
int appendLength = length - i;
int deAffixedLength = length - appendLength;
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
int stripLength = stripEnd - stripStart;
if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength))
{
continue;
}
char[] strippedWord = new char[stripLength + deAffixedLength];
Array.Copy(word, 0, strippedWord, 0, deAffixedLength);
Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix);
stems.AddRange(stemList);
}
}
}
}
return stems;
}
/// <summary>
/// checks condition of the concatenation of two strings </summary>
// note: this is pretty stupid, we really should subtract strip from the condition up front and just check the stem
// but this is a little bit more complicated.
private bool CheckCondition(int condition, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len)
{
if (condition != 0)
{
CharacterRunAutomaton pattern = dictionary.patterns[condition];
int state = pattern.InitialState;
for (int i = c1off; i < c1off + c1len; i++)
{
state = pattern.Step(state, c1[i]);
if (state == -1)
{
return false;
}
}
for (int i = c2off; i < c2off + c2len; i++)
{
state = pattern.Step(state, c2[i]);
if (state == -1)
{
return false;
}
}
return pattern.IsAccept(state);
}
return true;
}
/// <summary>
/// Applies the affix rule to the given word, producing a list of stems if any are found
/// </summary>
/// <param name="strippedWord"> Word the affix has been removed and the strip added </param>
/// <param name="length"> valid length of stripped word </param>
/// <param name="affix"> HunspellAffix representing the affix rule itself </param>
/// <param name="prefixFlag"> when we already stripped a prefix, we cant simply recurse and check the suffix, unless both are compatible
/// so we must check dictionary form against both to add it as a stem! </param>
/// <param name="recursionDepth"> current recursion depth </param>
/// <param name="prefix"> true if we are removing a prefix (false if its a suffix) </param>
/// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix
/// this means inner most suffix must also contain circumfix flag. </param>
/// <returns> <see cref="IList{CharsRef}"/> of stems for the word, or an empty list if none are found </returns>
internal IList<CharsRef> ApplyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, bool prefix, bool circumfix)
{
// TODO: just pass this in from before, no need to decode it twice
affixReader.Position = 8 * affix;
char flag = (char)(affixReader.ReadInt16() & 0xffff);
affixReader.SkipBytes(2); // strip
int condition = (char)(affixReader.ReadInt16() & 0xffff);
bool crossProduct = (condition & 1) == 1;
condition = (int)((uint)condition >> 1);
char append = (char)(affixReader.ReadInt16() & 0xffff);
List<CharsRef> stems = new List<CharsRef>();
Int32sRef forms = dictionary.LookupWord(strippedWord, 0, length);
if (forms != null)
{
for (int i = 0; i < forms.Length; i++)
{
dictionary.flagLookup.Get(forms.Int32s[forms.Offset + i], scratch);
char[] wordFlags = Dictionary.DecodeFlags(scratch);
if (Dictionary.HasFlag(wordFlags, flag))
{
// confusing: in this one exception, we already chained the first prefix against the second,
// so it doesnt need to be checked against the word
bool chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
if (chainedPrefix == false && prefixFlag >= 0 && !Dictionary.HasFlag(wordFlags, (char)prefixFlag))
{
// see if we can chain prefix thru the suffix continuation class (only if it has any!)
dictionary.flagLookup.Get(append, scratch);
char[] appendFlags = Dictionary.DecodeFlags(scratch);
if (!HasCrossCheckedFlag((char)prefixFlag, appendFlags, false))
{
continue;
}
}
// if circumfix was previously set by a prefix, we must check this suffix,
// to ensure it has it, and vice versa
if (dictionary.circumfix != -1)
{
dictionary.flagLookup.Get(append, scratch);
char[] appendFlags = Dictionary.DecodeFlags(scratch);
bool suffixCircumfix = Dictionary.HasFlag(appendFlags, (char)dictionary.circumfix);
if (circumfix != suffixCircumfix)
{
continue;
}
}
stems.Add(NewStem(strippedWord, length));
}
}
}
// if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
if (dictionary.circumfix != -1 && !circumfix && prefix)
{
dictionary.flagLookup.Get(append, scratch);
char[] appendFlags = Dictionary.DecodeFlags(scratch);
circumfix = Dictionary.HasFlag(appendFlags, (char)dictionary.circumfix);
}
if (crossProduct)
{
if (recursionDepth == 0)
{
if (prefix)
{
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
// COMPLEXPREFIXES = false: combine with a suffix
stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix));
}
else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix)
{
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix
stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
}
}
else if (recursionDepth == 1)
{
if (prefix && dictionary.complexPrefixes)
{
// we took away the second prefix: go look for another suffix
stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
}
else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix)
{
// we took away a prefix, then a suffix: go look for another suffix
stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
}
}
}
return stems;
}
/// <summary>
/// Checks if the given flag cross checks with the given array of flags
/// </summary>
/// <param name="flag"> Flag to cross check with the array of flags </param>
/// <param name="flags"> Array of flags to cross check against. Can be <c>null</c> </param>
/// <param name="matchEmpty"> If true, will match a zero length flags array. </param>
/// <returns> <c>true</c> if the flag is found in the array or the array is <c>null</c>, <c>false</c> otherwise </returns>
private bool HasCrossCheckedFlag(char flag, char[] flags, bool matchEmpty)
{
return (flags.Length == 0 && matchEmpty) || Array.BinarySearch(flags, flag) >= 0;
}
}
}