/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
/**
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.
* It conforms to the algorithm of the original Hunspell implementation, including recursive suffix
* stripping.
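*
* <p>A minimal usage sketch (assuming a {@code Dictionary} was already loaded elsewhere from an
* {@code .aff}/{@code .dic} pair; the word and its stem are illustrative):
*
* <pre>{@code
* Stemmer stemmer = new Stemmer(dictionary);
* List<CharsRef> stems = stemmer.stem("walked"); // e.g. [walk] with an English dictionary
* }</pre>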
*/
final class Stemmer {
private final Dictionary dictionary;
// formStep is 1 if we have no stem exceptions; otherwise it's 2, and every other
// value in a form is really an ID pointing into the morphological data table
private final int formStep;
/**
* Constructs a new Stemmer which will use the provided Dictionary to create its stems.
*
* @param dictionary Dictionary that will be used to create the stems
*/
public Stemmer(Dictionary dictionary) {
this.dictionary = dictionary;
formStep = dictionary.formStep();
}
/**
* Find the stem(s) of the provided word.
*
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<CharsRef> stem(String word) {
return stem(word.toCharArray(), word.length());
}
/**
* Find the stem(s) of the provided word.
*
* @param word Word to find the stems for
* @param length the length of the word in the buffer
* @return List of stems for the word
*/
public List<CharsRef> stem(char[] word, int length) {
if (dictionary.mayNeedInputCleaning()) {
CharsRef scratchSegment = new CharsRef(word, 0, length);
if (dictionary.needsInputCleaning(scratchSegment)) {
StringBuilder segment = new StringBuilder();
dictionary.cleanInput(scratchSegment, segment);
char[] scratchBuffer = new char[segment.length()];
length = segment.length();
segment.getChars(0, length, scratchBuffer, 0);
word = scratchBuffer;
}
}
List<CharsRef> list = new ArrayList<>();
if (length == 0) {
return list;
}
RootProcessor processor =
(stem, formID, morphDataId) -> {
list.add(newStem(stem, morphDataId));
return true;
};
if (!doStem(word, 0, length, WordContext.SIMPLE_WORD, processor)) {
return list;
}
WordCase wordCase = caseOf(word, length);
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
CaseVariationProcessor variationProcessor =
(variant, varLength, originalCase) ->
doStem(variant, 0, varLength, WordContext.SIMPLE_WORD, processor);
varyCase(word, length, wordCase, variationProcessor);
}
return list;
}
interface CaseVariationProcessor {
boolean process(char[] word, int length, WordCase originalCase);
}
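/**
* Feeds case variations of the given UPPER- or TITLE-case word to the processor until it
* returns false, e.g. (illustrative) the UPPER word "SANT'ELIA" yields "Sant'Elia",
* "Sant'elia" and "sant'elia", plus sharp-s variants for CHECKSHARPS dictionaries.
*/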
boolean varyCase(char[] word, int length, WordCase wordCase, CaseVariationProcessor processor) {
char[] titleBuffer = wordCase == WordCase.UPPER ? caseFoldTitle(word, length) : null;
if (wordCase == WordCase.UPPER) {
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
if (aposCase != null && !processor.process(aposCase, length, wordCase)) {
return false;
}
if (!processor.process(titleBuffer, length, wordCase)) {
return false;
}
if (dictionary.checkSharpS && !varySharpS(titleBuffer, length, processor)) {
return false;
}
}
if (dictionary.isDotICaseChangeDisallowed(word)) {
return true;
}
char[] lowerBuffer = caseFoldLower(titleBuffer != null ? titleBuffer : word, length);
if (!processor.process(lowerBuffer, length, wordCase)) {
return false;
}
if (wordCase == WordCase.UPPER
&& dictionary.checkSharpS
&& !varySharpS(lowerBuffer, length, processor)) {
return false;
}
return true;
}
/** returns the {@link WordCase} of the word, or MIXED if the dictionary ignores case or the word starts lowercase */
WordCase caseOf(char[] word, int length) {
if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) {
return WordCase.MIXED;
}
return WordCase.caseOf(word, length);
}
/** returns a new buffer with the title-case variant of the word: every char after the first is case-folded */
private char[] caseFoldTitle(char[] word, int length) {
char[] titleBuffer = new char[length];
System.arraycopy(word, 0, titleBuffer, 0, length);
for (int i = 1; i < length; i++) {
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
}
return titleBuffer;
}
/** returns a new buffer with the lowercase variant of the (already title-cased) word: only the first char is case-folded */
private char[] caseFoldLower(char[] word, int length) {
char[] lowerBuffer = new char[length];
System.arraycopy(word, 0, lowerBuffer, 0, length);
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
return lowerBuffer;
}
// Special prefix handling for Catalan, French, Italian:
// prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
private static char[] capitalizeAfterApostrophe(char[] word, int length) {
for (int i = 1; i < length - 1; i++) {
if (word[i] == '\'') {
char next = word[i + 1];
char upper = Character.toUpperCase(next);
if (upper != next) {
char[] copy = ArrayUtil.copyOfSubArray(word, 0, length);
copy[i + 1] = upper;
return copy;
}
}
}
return null;
}
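// For CHECKSHARPS dictionaries (German), upper-case "SS" may stand for "ß", so we enumerate
// every way of keeping "ss" or replacing it with "ß", e.g. (illustrative)
// "ausstoss" -> "ausstoß", "außtoss", "außtoß" (the unchanged original is skipped below).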
private boolean varySharpS(char[] word, int length, CaseVariationProcessor processor) {
Stream<String> result =
new Object() {
int findSS(int start) {
for (int i = start; i < length - 1; i++) {
if (word[i] == 's' && word[i + 1] == 's') {
return i;
}
}
return -1;
}
Stream<String> replaceSS(int start, int depth) {
if (depth > 5) { // cut off too large enumeration
return Stream.of(new String(word, start, length - start));
}
int ss = findSS(start);
if (ss < 0) {
return null;
} else {
String prefix = new String(word, start, ss - start);
Stream<String> tails = replaceSS(ss + 2, depth + 1);
if (tails == null) {
tails = Stream.of(new String(word, ss + 2, length - ss - 2));
}
return tails.flatMap(s -> Stream.of(prefix + "ss" + s, prefix + "ß" + s));
}
}
}.replaceSS(0, 0);
if (result == null) return true;
String src = new String(word, 0, length);
for (String s : result.collect(Collectors.toList())) {
if (!s.equals(src) && !processor.process(s.toCharArray(), s.length(), null)) {
return false;
}
}
return true;
}
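/**
* Looks the word up in the dictionary, reports every root compatible with the given context to
* the processor, and then attempts affix stripping. Returns false if the processor requested
* that processing stop, or if a COMPOUNDFORBIDFLAG entry forbids this word inside a compound.
*/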
boolean doStem(
char[] word, int offset, int length, WordContext context, RootProcessor processor) {
IntsRef forms = dictionary.lookupWord(word, offset, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
int entryId = forms.ints[forms.offset + i];
// we can't add this form, it's a pseudostem requiring an affix
if (dictionary.hasFlag(entryId, dictionary.needaffix)) {
continue;
}
if ((context == WordContext.COMPOUND_BEGIN || context == WordContext.COMPOUND_MIDDLE)
&& dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
return false;
}
if (!isRootCompatibleWithContext(context, -1, entryId)) {
continue;
}
if (!callProcessor(word, offset, length, processor, forms, i)) {
return false;
}
}
}
return stem(
word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
}
/**
* Find the unique stem(s) of the provided word.
*
* @param word Word to find the stems for
* @param length the length of the word in the buffer
* @return List of unique stems for the word
*/
public List<CharsRef> uniqueStems(char[] word, int length) {
List<CharsRef> stems = stem(word, length);
if (stems.size() < 2) {
return stems;
}
CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
List<CharsRef> deduped = new ArrayList<>();
for (CharsRef s : stems) {
if (!terms.contains(s)) {
deduped.add(s);
terms.add(s);
}
}
return deduped;
}
interface RootProcessor {
/**
* @param stem the text of the found dictionary entry
* @param formID internal id of the dictionary entry, e.g. to be used in {@link
* Dictionary#hasFlag(int, char)}
* @param morphDataId the id of the custom morphological data (0 if none), to be used with
* {@link Dictionary#morphData}
* @return whether the processing should be continued
*/
boolean processRoot(CharsRef stem, int formID, int morphDataId);
}
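// Morphological data is a space-separated list of "key:value" fields (e.g., illustratively,
// "po:noun st:foo"); the "st:" field, if present, supplies a stem exception.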
private String stemException(int morphDataId) {
if (morphDataId > 0) {
String data = dictionary.morphData.get(morphDataId);
int start;
if (data.startsWith("st:")) {
start = 0;
} else {
int space = data.indexOf(" st:");
start = space < 0 ? -1 : space + 1; // point at the 's' of "st:", not the space before it
}
if (start >= 0) {
int nextSpace = data.indexOf(' ', start + 3);
return data.substring(start + 3, nextSpace < 0 ? data.length() : nextSpace);
}
}
return null;
}
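/**
* Produces the final stem text: the stem exception from the morphological data if present,
* otherwise the dictionary entry itself, with OCONV output conversion applied when declared.
*/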
private CharsRef newStem(CharsRef stem, int morphDataId) {
String exception = stemException(morphDataId);
if (dictionary.oconv != null) {
StringBuilder scratchSegment = new StringBuilder();
if (exception != null) {
scratchSegment.append(exception);
} else {
scratchSegment.append(stem.chars, stem.offset, stem.length);
}
dictionary.oconv.applyMappings(scratchSegment);
char[] cleaned = new char[scratchSegment.length()];
scratchSegment.getChars(0, cleaned.length, cleaned, 0);
return new CharsRef(cleaned, 0, cleaned.length);
} else {
if (exception != null) {
return new CharsRef(exception);
} else {
return stem;
}
}
}
/**
* Generates a list of stems for the provided word
*
* @param word Word to generate the stems for
* @param previous previous affix that was removed (so we don't remove the same one twice)
* @param prevFlag Flag from a previous stemming step that needs to be cross-checked with any
* affixes in this recursive step
* @param prefixId ID of the innermost removed prefix, so that when removing a suffix, it's also
* checked against the word
* @param recursionDepth current recursion depth
* @param doPrefix true if we should remove prefixes
* @param previousWasPrefix true if the previous removal was a prefix: if we are removing a
* suffix and it has no continuation requirements, that's fine; but two prefixes
* (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
* @return whether the processing should be continued
*/
private boolean stem(
char[] word,
int offset,
int length,
WordContext context,
int previous,
char prevFlag,
int prefixId,
int recursionDepth,
boolean doPrefix,
boolean previousWasPrefix,
RootProcessor processor) {
FST.Arc<IntsRef> arc = new FST.Arc<>();
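// Try stripping prefixes: walk the prefix FST along the first characters of the word;
// the outputs collected at each final state are the ids of prefixes of that length.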
if (doPrefix && dictionary.prefixes != null) {
FST<IntsRef> fst = dictionary.prefixes;
FST.BytesReader reader = fst.getBytesReader();
fst.getFirstArc(arc);
IntsRef output = fst.outputs.getNoOutput();
int limit = dictionary.fullStrip ? length + 1 : length;
for (int i = 0; i < limit; i++) {
if (i > 0) {
output = Dictionary.nextArc(fst, arc, reader, output, word[offset + i - 1]);
if (output == null) {
break;
}
}
if (!arc.isFinal()) {
continue;
}
IntsRef prefixes = fst.outputs.add(output, arc.nextFinalOutput());
for (int j = 0; j < prefixes.length; j++) {
int prefix = prefixes.ints[prefixes.offset + j];
if (prefix == previous) {
continue;
}
if (isAffixCompatible(prefix, prevFlag, recursionDepth, true, false, context)) {
char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
if (strippedWord == null) {
continue;
}
boolean pureAffix = strippedWord == word;
if (!applyAffix(
strippedWord,
pureAffix ? offset + i : 0,
pureAffix ? length - i : strippedWord.length,
context,
prefix,
previous,
-1,
recursionDepth,
true,
processor)) {
return false;
}
}
}
}
}
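// Try stripping suffixes: walk the suffix FST backwards from the end of the word;
// the outputs collected at each final state are the ids of suffixes of length (length - i).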
if (dictionary.suffixes != null) {
FST<IntsRef> fst = dictionary.suffixes;
FST.BytesReader reader = fst.getBytesReader();
fst.getFirstArc(arc);
IntsRef output = fst.outputs.getNoOutput();
int limit = dictionary.fullStrip ? 0 : 1;
for (int i = length; i >= limit; i--) {
if (i < length) {
output = Dictionary.nextArc(fst, arc, reader, output, word[offset + i]);
if (output == null) {
break;
}
}
if (!arc.isFinal()) {
continue;
}
IntsRef suffixes = fst.outputs.add(output, arc.nextFinalOutput());
for (int j = 0; j < suffixes.length; j++) {
int suffix = suffixes.ints[suffixes.offset + j];
if (suffix == previous) {
continue;
}
if (isAffixCompatible(
suffix, prevFlag, recursionDepth, false, previousWasPrefix, context)) {
char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
if (strippedWord == null) {
continue;
}
boolean pureAffix = strippedWord == word;
if (!applyAffix(
strippedWord,
pureAffix ? offset : 0,
pureAffix ? i : strippedWord.length,
context,
suffix,
previous,
prefixId,
recursionDepth,
false,
processor)) {
return false;
}
}
}
}
}
return true;
}
/**
* @return null if the affix condition isn't met; a reference to the same char[] if the affix has
* no strip data and can thus be simply removed; or a new char[] containing the word after
* affix removal
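* <p>For example (illustrative), with a suffix rule that strips "y" and appends "ies",
* removing the "ies" affix from "bodies" and re-adding the "y" strip text yields the
* candidate stem "body".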
*/
private char[] stripAffix(
char[] word, int offset, int length, int affixLen, int affix, boolean isPrefix) {
int deAffixedLen = length - affixLen;
int stripOrd = dictionary.affixData(affix, Dictionary.AFFIX_STRIP_ORD);
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
int stripLen = stripEnd - stripStart;
if (stripLen + deAffixedLen == 0) return null;
char[] stripData = dictionary.stripData;
int condition = dictionary.getAffixCondition(affix);
if (condition != 0) {
int deAffixedOffset = isPrefix ? offset + affixLen : offset;
if (!dictionary.patterns.get(condition).acceptsStem(word, deAffixedOffset, deAffixedLen)) {
return null;
}
}
if (stripLen == 0) return word;
char[] strippedWord = new char[stripLen + deAffixedLen];
System.arraycopy(
word,
offset + (isPrefix ? affixLen : 0),
strippedWord,
isPrefix ? stripLen : 0,
deAffixedLen);
System.arraycopy(stripData, stripStart, strippedWord, isPrefix ? 0 : deAffixedLen, stripLen);
return strippedWord;
}
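/**
* Checks whether the affix may participate at this point: compound restrictions
* (COMPOUNDFORBIDFLAG, COMPOUNDPERMITFLAG, ONLYINCOMPOUND) and, when recursing, the
* cross-product continuation requirement against the previous affix's flag.
*/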
private boolean isAffixCompatible(
int affix,
char prevFlag,
int recursionDepth,
boolean isPrefix,
boolean previousWasPrefix,
WordContext context) {
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
if (context.isCompound()) {
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
return false;
}
if (!context.isAffixAllowedWithoutSpecialPermit(isPrefix)
&& !dictionary.hasFlag(append, dictionary.compoundPermit)) {
return false;
}
if (context == WordContext.COMPOUND_END
&& !isPrefix
&& !previousWasPrefix
&& dictionary.hasFlag(append, dictionary.onlyincompound)) {
return false;
}
} else if (dictionary.hasFlag(append, dictionary.onlyincompound)) {
return false;
}
if (recursionDepth == 0) {
return true;
}
if (dictionary.isCrossProduct(affix)) {
// cross check incoming continuation class (flag of previous affix) against list.
return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
}
return false;
}
/**
* Applies the affix rule to the given word, producing a list of stems if any are found
*
* @param strippedWord Char array containing the word with the affix removed and the strip added
* @param offset where the word actually starts in the array
* @param length the length of the stripped word
* @param affix id of the affix rule itself
* @param prefixId when we already stripped a prefix, we can't simply recurse and check the
* suffix in isolation; the dictionary form must be checked against both affixes to add it
* as a stem!
* @param recursionDepth current recursion depth
* @param prefix true if we are removing a prefix (false if it's a suffix)
* @return whether the processing should be continued
*/
private boolean applyAffix(
char[] strippedWord,
int offset,
int length,
WordContext context,
int affix,
int previousAffix,
int prefixId,
int recursionDepth,
boolean prefix,
RootProcessor processor) {
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId);
IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
int entryId = forms.ints[forms.offset + i];
if (dictionary.hasFlag(entryId, flag) || isFlagAppendedByAffix(prefixId, flag)) {
// confusing: in this one exception, we already chained the first prefix against the second,
// so it doesn't need to be checked against the word
boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
if (!chainedPrefix && prefixId >= 0) {
char prefixFlag = dictionary.affixData(prefixId, Dictionary.AFFIX_FLAG);
if (!dictionary.hasFlag(entryId, prefixFlag)
&& !isFlagAppendedByAffix(affix, prefixFlag)) {
continue;
}
}
if (!isRootCompatibleWithContext(context, affix, entryId)) {
continue;
}
if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
return false;
}
}
}
}
if (dictionary.isCrossProduct(affix) && recursionDepth <= 1) {
boolean doPrefix;
if (recursionDepth == 0) {
if (prefix) {
prefixId = affix;
doPrefix = dictionary.complexPrefixes && dictionary.isSecondStagePrefix(flag);
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
// COMPLEXPREFIXES = false: combine with a suffix
} else if (!dictionary.complexPrefixes && dictionary.isSecondStageSuffix(flag)) {
doPrefix = false;
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix
} else {
return true;
}
} else {
doPrefix = false;
if (prefix && dictionary.complexPrefixes) {
prefixId = affix;
// we took away the second prefix: go look for another suffix
} else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageSuffix(flag)) {
return true;
}
// we took away a prefix, then a suffix: go look for another suffix
}
return stem(
strippedWord,
offset,
length,
context,
affix,
flag,
prefixId,
recursionDepth + 1,
doPrefix,
prefix,
processor);
}
return true;
}
private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
return false;
}
if (context.isCompound() && context != WordContext.COMPOUND_RULE_END) {
char cFlag = context.requiredFlag(dictionary);
return dictionary.hasFlag(entryId, cFlag)
|| isFlagAppendedByAffix(lastAffix, cFlag)
|| dictionary.hasFlag(entryId, dictionary.compoundFlag)
|| isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
}
return true;
}
private boolean callProcessor(
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
CharsRef stem = new CharsRef(word, offset, length);
int morphDataId = dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
return processor.processRoot(stem, forms.ints[forms.offset + i], morphDataId);
}
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
char circumfix = dictionary.circumfix;
// if the circumfix flag was set by a previously stripped prefix, this suffix must carry
// it too (and vice versa); otherwise the form isn't a valid root yet
if (isSuffix
&& isFlagAppendedByAffix(prefixId, circumfix) != isFlagAppendedByAffix(affix, circumfix)) {
return true;
}
if (isFlagAppendedByAffix(affix, dictionary.needaffix)) {
return !isSuffix
|| previousAffix < 0
|| isFlagAppendedByAffix(previousAffix, dictionary.needaffix);
}
return false;
}
private boolean isFlagAppendedByAffix(int affixId, char flag) {
if (affixId < 0 || flag == Dictionary.FLAG_UNSET) return false;
int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
return dictionary.hasFlag(appendId, flag);
}
}