/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
/**
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.
* It conforms to the algorithm of the original Hunspell implementation, including recursive suffix
* stripping.
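*
* <p>A minimal usage sketch (assuming a {@code Dictionary} was already loaded elsewhere from an
* {@code .aff}/{@code .dic} pair; the word and its stem are illustrative):
*
* <pre>{@code
* Stemmer stemmer = new Stemmer(dictionary);
* List<CharsRef> stems = stemmer.stem("walked"); // e.g. [walk] with an English dictionary
* }</pre>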
*/
final class Stemmer {
private final Dictionary dictionary;
// formStep is 1 if we have no stem exceptions; otherwise it's 2, and every other
// value in a form is really an ID pointing into the morphological data table
private final int formStep;
/**
* Constructs a new Stemmer which will use the provided Dictionary to create its stems.
*
* @param dictionary Dictionary that will be used to create the stems
*/
public Stemmer(Dictionary dictionary) {
this.dictionary = dictionary;
formStep = dictionary.formStep();
}
/**
* Find the stem(s) of the provided word.
*
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<CharsRef> stem(String word) {
return stem(word.toCharArray(), word.length());
}
/**
* Find the stem(s) of the provided word.
*
* @param word Word to find the stems for
* @param length the length of the word in the buffer
* @return List of stems for the word
*/
public List<CharsRef> stem(char[] word, int length) {
if (dictionary.mayNeedInputCleaning()) {
CharsRef scratchSegment = new CharsRef(word, 0, length);
if (dictionary.needsInputCleaning(scratchSegment)) {
StringBuilder segment = new StringBuilder();
dictionary.cleanInput(scratchSegment, segment);
char[] scratchBuffer = new char[segment.length()];
length = segment.length();
segment.getChars(0, length, scratchBuffer, 0);
word = scratchBuffer;
}
}
List<CharsRef> list = new ArrayList<>();
if (length == 0) {
return list;
}
RootProcessor processor =
(stem, formID, morphDataId) -> {
list.add(newStem(stem, morphDataId));
return true;
};
if (!doStem(word, 0, length, WordContext.SIMPLE_WORD, processor)) {
return list;
}
WordCase wordCase = caseOf(word, length);
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
CaseVariationProcessor variationProcessor =
(variant, varLength, originalCase) ->
doStem(variant, 0, varLength, WordContext.SIMPLE_WORD, processor);
varyCase(word, length, wordCase, variationProcessor);
}
return list;
}
interface CaseVariationProcessor {
boolean process(char[] word, int length, WordCase originalCase);
}
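/**
* Feeds case variations of the given UPPER- or TITLE-case word to the processor until it
* returns false, e.g. (illustrative) the UPPER word "SANT'ELIA" yields "Sant'Elia",
* "Sant'elia" and "sant'elia", plus sharp-s variants for CHECKSHARPS dictionaries.
*/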
boolean varyCase(char[] word, int length, WordCase wordCase, CaseVariationProcessor processor) {
char[] titleBuffer = wordCase == WordCase.UPPER ? caseFoldTitle(word, length) : null;
if (wordCase == WordCase.UPPER) {
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
if (aposCase != null && !processor.process(aposCase, length, wordCase)) {
return false;
}
if (!processor.process(titleBuffer, length, wordCase)) {
return false;
}
if (dictionary.checkSharpS && !varySharpS(titleBuffer, length, processor)) {
return false;
}
}
if (dictionary.isDotICaseChangeDisallowed(word)) {
return true;
}
char[] lowerBuffer = caseFoldLower(titleBuffer != null ? titleBuffer : word, length);
if (!processor.process(lowerBuffer, length, wordCase)) {
return false;
}
if (wordCase == WordCase.UPPER
&& dictionary.checkSharpS
&& !varySharpS(lowerBuffer, length, processor)) {
return false;
}
return true;
}
/** returns the {@link WordCase} of the word, or MIXED if the dictionary ignores case or the word starts lowercase */
WordCase caseOf(char[] word, int length) {
if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) {
return WordCase.MIXED;
}
return WordCase.caseOf(word, length);
}
/** returns a new buffer with the title-case variant of the word: every char after the first is case-folded */
private char[] caseFoldTitle(char[] word, int length) {
char[] titleBuffer = new char[length];
System.arraycopy(word, 0, titleBuffer, 0, length);
for (int i = 1; i < length; i++) {
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
}
return titleBuffer;
}
/** returns a new buffer with the lowercase variant of the (already title-cased) word: only the first char is case-folded */
private char[] caseFoldLower(char[] word, int length) {
char[] lowerBuffer = new char[length];
System.arraycopy(word, 0, lowerBuffer, 0, length);
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
return lowerBuffer;
}
// Special prefix handling for Catalan, French, Italian:
// prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
private static char[] capitalizeAfterApostrophe(char[] word, int length) {
for (int i = 1; i < length - 1; i++) {
if (word[i] == '\'') {
char next = word[i + 1];
char upper = Character.toUpperCase(next);
if (upper != next) {
char[] copy = ArrayUtil.copyOfSubArray(word, 0, length);
copy[i + 1] = upper;
return copy;
}
}
}
return null;
}
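// For CHECKSHARPS dictionaries (German), upper-case "SS" may stand for "ß", so we enumerate
// every way of keeping "ss" or replacing it with "ß", e.g. (illustrative)
// "ausstoss" -> "ausstoß", "außtoss", "außtoß" (the unchanged original is skipped below).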
private boolean varySharpS(char[] word, int length, CaseVariationProcessor processor) {
Stream<String> result =
new Object() {
int findSS(int start) {
for (int i = start; i < length - 1; i++) {
if (word[i] == 's' && word[i + 1] == 's') {
return i;
}
}
return -1;
}
Stream<String> replaceSS(int start, int depth) {
if (depth > 5) { // cut off too large enumeration
return Stream.of(new String(word, start, length - start));
}
int ss = findSS(start);
if (ss < 0) {
return null;
} else {
String prefix = new String(word, start, ss - start);
Stream<String> tails = replaceSS(ss + 2, depth + 1);
if (tails == null) {
tails = Stream.of(new String(word, ss + 2, length - ss - 2));
}
return tails.flatMap(s -> Stream.of(prefix + "ss" + s, prefix + "ß" + s));
}
}
}.replaceSS(0, 0);
if (result == null) return true;
String src = new String(word, 0, length);
for (String s : result.collect(Collectors.toList())) {
if (!s.equals(src) && !processor.process(s.toCharArray(), s.length(), null)) {
return false;
}
}
return true;
}
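/**
* Looks the word up in the dictionary, reports every root compatible with the given context to
* the processor, and then attempts affix stripping. Returns false if the processor requested
* that processing stop, or if a COMPOUNDFORBIDFLAG entry forbids this word inside a compound.
*/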
boolean doStem(
char[] word, int offset, int length, WordContext context, RootProcessor processor) {
IntsRef forms = dictionary.lookupWord(word, offset, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
int entryId = forms.ints[forms.offset + i];
// we can't add this form, it's a pseudostem requiring an affix
if (dictionary.hasFlag(entryId, dictionary.needaffix)) {
continue;
}
if ((context == WordContext.COMPOUND_BEGIN || context == WordContext.COMPOUND_MIDDLE)
&& dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
return false;
}
if (!isRootCompatibleWithContext(context, -1, entryId)) {
continue;
}
if (!callProcessor(word, offset, length, processor, forms, i)) {
return false;
}
}
}
return stem(
word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
}
/**
* Find the unique stem(s) of the provided word.
*
* @param word Word to find the stems for
* @param length the length of the word in the buffer
* @return List of unique stems for the word
*/
public List<CharsRef> uniqueStems(char[] word, int length) {
List<CharsRef> stems = stem(word, length);
if (stems.size() < 2) {
return stems;
}
CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
List<CharsRef> deduped = new ArrayList<>();
for (CharsRef s : stems) {
if (!terms.contains(s)) {
deduped.add(s);
terms.add(s);
}
}
return deduped;
}
interface RootProcessor {
/**
* @param stem the text of the found dictionary entry
* @param formID internal id of the dictionary entry, e.g. to be used in {@link
* Dictionary#hasFlag(int, char)}
* @param morphDataId the id of the custom morphological data (0 if none), to be used with
* {@link Dictionary#morphData}
* @return whether the processing should be continued
*/
boolean processRoot(CharsRef stem, int formID, int morphDataId);
}
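// Morphological data is a space-separated list of "key:value" fields (e.g., illustratively,
// "po:noun st:foo"); the "st:" field, if present, supplies a stem exception.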
private String stemException(int morphDataId) {
if (morphDataId > 0) {
String data = dictionary.morphData.get(morphDataId);
int start;
if (data.startsWith("st:")) {
start = 0;
} else {
int space = data.indexOf(" st:");
start = space < 0 ? -1 : space + 1; // point at the 's' of "st:", not the space before it
}
if (start >= 0) {
int nextSpace = data.indexOf(' ', start + 3);
return data.substring(start + 3, nextSpace < 0 ? data.length() : nextSpace);
}
}
return null;
}
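/**
* Produces the final stem text: the stem exception from the morphological data if present,
* otherwise the dictionary entry itself, with OCONV output conversion applied when declared.
*/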
private CharsRef newStem(CharsRef stem, int morphDataId) {
String exception = stemException(morphDataId);
if (dictionary.oconv != null) {
StringBuilder scratchSegment = new StringBuilder();
if (exception != null) {
scratchSegment.append(exception);
} else {
scratchSegment.append(stem.chars, stem.offset, stem.length);
}
dictionary.oconv.applyMappings(scratchSegment);
char[] cleaned = new char[scratchSegment.length()];
scratchSegment.getChars(0, cleaned.length, cleaned, 0);
return new CharsRef(cleaned, 0, cleaned.length);
} else {
if (exception != null) {
return new CharsRef(exception);
} else {
return stem;
}
}
}
/**
* Generates a list of stems for the provided word
*
* @param word Word to generate the stems for
* @param previous previous affix that was removed (so we don't remove the same one twice)
* @param prevFlag Flag from a previous stemming step that needs to be cross-checked with any
* affixes in this recursive step
* @param prefixId ID of the innermost removed prefix, so that when removing a suffix, it's also
* checked against the word
* @param recursionDepth current recursion depth
* @param doPrefix true if we should remove prefixes
* @param previousWasPrefix true if the previous removal was a prefix: if we are removing a
* suffix and it has no continuation requirements, that's fine; but two prefixes
* (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
* @return whether the processing should be continued
*/
private boolean stem(
char[] word,
int offset,
int length,
WordContext context,
int previous,
char prevFlag,
int prefixId,
int recursionDepth,
boolean doPrefix,
boolean previousWasPrefix,
RootProcessor processor) {
FST.Arc<IntsRef> arc = new FST.Arc<>();
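// Try stripping prefixes: walk the prefix FST along the first characters of the word;
// the outputs collected at each final state are the ids of prefixes of that length.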
if (doPrefix && dictionary.prefixes != null) {
FST<IntsRef> fst = dictionary.prefixes;
FST.BytesReader reader = fst.getBytesReader();
fst.getFirstArc(arc);
IntsRef output = fst.outputs.getNoOutput();
int limit = dictionary.fullStrip ? length + 1 : length;
for (int i = 0; i < limit; i++) {
if (i > 0) {
output = Dictionary.nextArc(fst, arc, reader, output, word[offset + i - 1]);
if (output == null) {
break;
}
}
if (!arc.isFinal()) {
continue;
}
IntsRef prefixes = fst.outputs.add(output, arc.nextFinalOutput());
for (int j = 0; j < prefixes.length; j++) {
int prefix = prefixes.ints[prefixes.offset + j];
if (prefix == previous) {
continue;
}
if (isAffixCompatible(prefix, prevFlag, recursionDepth, true, false, context)) {
char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
if (strippedWord == null) {
continue;
}
boolean pureAffix = strippedWord == word;
if (!applyAffix(
strippedWord,
pureAffix ? offset + i : 0,
pureAffix ? length - i : strippedWord.length,
context,
prefix,
previous,
-1,
recursionDepth,
true,
processor)) {
return false;
}
}
}
}
}
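// Try stripping suffixes: walk the suffix FST backwards from the end of the word;
// the outputs collected at each final state are the ids of suffixes of length (length - i).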
if (dictionary.suffixes != null) {
FST<IntsRef> fst = dictionary.suffixes;
FST.BytesReader reader = fst.getBytesReader();
fst.getFirstArc(arc);
IntsRef output = fst.outputs.getNoOutput();
int limit = dictionary.fullStrip ? 0 : 1;
for (int i = length; i >= limit; i--) {
if (i < length) {
output = Dictionary.nextArc(fst, arc, reader, output, word[offset + i]);
if (output == null) {
break;
}
}
if (!arc.isFinal()) {
continue;
}
IntsRef suffixes = fst.outputs.add(output, arc.nextFinalOutput());
for (int j = 0; j < suffixes.length; j++) {
int suffix = suffixes.ints[suffixes.offset + j];
if (suffix == previous) {
continue;
}
if (isAffixCompatible(
suffix, prevFlag, recursionDepth, false, previousWasPrefix, context)) {
char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
if (strippedWord == null) {
continue;
}
boolean pureAffix = strippedWord == word;
if (!applyAffix(
strippedWord,
pureAffix ? offset : 0,
pureAffix ? i : strippedWord.length,
context,
suffix,
previous,
prefixId,
recursionDepth,
false,
processor)) {
return false;
}
}
}
}
}
return true;
}
/**
* @return null if the affix condition isn't met; a reference to the same char[] if the affix has
* no strip data and can thus be simply removed; or a new char[] containing the word after
* affix removal
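* <p>For example (illustrative), with a suffix rule that strips "y" and appends "ies",
* removing the "ies" affix from "bodies" and re-adding the "y" strip text yields the
* candidate stem "body".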
*/
private char[] stripAffix(
char[] word, int offset, int length, int affixLen, int affix, boolean isPrefix) {
int deAffixedLen = length - affixLen;
int stripOrd = dictionary.affixData(affix, Dictionary.AFFIX_STRIP_ORD);
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
int stripLen = stripEnd - stripStart;
if (stripLen + deAffixedLen == 0) return null;
char[] stripData = dictionary.stripData;
int condition = dictionary.getAffixCondition(affix);
if (condition != 0) {
int deAffixedOffset = isPrefix ? offset + affixLen : offset;
if (!dictionary.patterns.get(condition).acceptsStem(word, deAffixedOffset, deAffixedLen)) {
return null;
}
}
if (stripLen == 0) return word;
char[] strippedWord = new char[stripLen + deAffixedLen];
System.arraycopy(
word,
offset + (isPrefix ? affixLen : 0),
strippedWord,
isPrefix ? stripLen : 0,
deAffixedLen);
System.arraycopy(stripData, stripStart, strippedWord, isPrefix ? 0 : deAffixedLen, stripLen);
return strippedWord;
}
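/**
* Checks whether the affix may participate at this point: compound restrictions
* (COMPOUNDFORBIDFLAG, COMPOUNDPERMITFLAG, ONLYINCOMPOUND) and, when recursing, the
* cross-product continuation requirement against the previous affix's flag.
*/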
private boolean isAffixCompatible(
int affix,
char prevFlag,
int recursionDepth,
boolean isPrefix,
boolean previousWasPrefix,
WordContext context) {
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
if (context.isCompound()) {
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
return false;
}
if (!context.isAffixAllowedWithoutSpecialPermit(isPrefix)
&& !dictionary.hasFlag(append, dictionary.compoundPermit)) {
return false;
}
if (context == WordContext.COMPOUND_END
&& !isPrefix
&& !previousWasPrefix
&& dictionary.hasFlag(append, dictionary.onlyincompound)) {
return false;
}
} else if (dictionary.hasFlag(append, dictionary.onlyincompound)) {
return false;
}
if (recursionDepth == 0) {
return true;
}
if (dictionary.isCrossProduct(affix)) {
// cross check incoming continuation class (flag of previous affix) against list.
return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
}
return false;
}
/**
* Applies the affix rule to the given word, producing a list of stems if any are found
*
* @param strippedWord Char array containing the word with the affix removed and the strip added
* @param offset where the word actually starts in the array
* @param length the length of the stripped word
* @param affix id of the affix rule itself
* @param prefixId when we already stripped a prefix, we can't simply recurse and check the
* suffix in isolation; the dictionary form must be checked against both affixes to add it
* as a stem!
* @param recursionDepth current recursion depth
* @param prefix true if we are removing a prefix (false if it's a suffix)
* @return whether the processing should be continued
*/
private boolean applyAffix(
char[] strippedWord,
int offset,
int length,
WordContext context,
int affix,
int previousAffix,
int prefixId,
int recursionDepth,
boolean prefix,
RootProcessor processor) {
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId);
IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
int entryId = forms.ints[forms.offset + i];
if (dictionary.hasFlag(entryId, flag) || isFlagAppendedByAffix(prefixId, flag)) {
// confusing: in this one exception, we already chained the first prefix against the second,
// so it doesn't need to be checked against the word
boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
if (!chainedPrefix && prefixId >= 0) {
char prefixFlag = dictionary.affixData(prefixId, Dictionary.AFFIX_FLAG);
if (!dictionary.hasFlag(entryId, prefixFlag)
&& !isFlagAppendedByAffix(affix, prefixFlag)) {
continue;
}
}
if (!isRootCompatibleWithContext(context, affix, entryId)) {
continue;
}
if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
return false;
}
}
}
}
if (dictionary.isCrossProduct(affix) && recursionDepth <= 1) {
boolean doPrefix;
if (recursionDepth == 0) {
if (prefix) {
prefixId = affix;
doPrefix = dictionary.complexPrefixes && dictionary.isSecondStagePrefix(flag);
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
// COMPLEXPREFIXES = false: combine with a suffix
} else if (!dictionary.complexPrefixes && dictionary.isSecondStageSuffix(flag)) {
doPrefix = false;
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix
} else {
return true;
}
} else {
doPrefix = false;
if (prefix && dictionary.complexPrefixes) {
prefixId = affix;
// we took away the second prefix: go look for another suffix
} else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageSuffix(flag)) {
return true;
}
// we took away a prefix, then a suffix: go look for another suffix
}
return stem(
strippedWord,
offset,
length,
context,
affix,
flag,
prefixId,
recursionDepth + 1,
doPrefix,
prefix,
processor);
}
return true;
}
private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
return false;
}
if (context.isCompound() && context != WordContext.COMPOUND_RULE_END) {
char cFlag = context.requiredFlag(dictionary);
return dictionary.hasFlag(entryId, cFlag)
|| isFlagAppendedByAffix(lastAffix, cFlag)
|| dictionary.hasFlag(entryId, dictionary.compoundFlag)
|| isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
}
return true;
}
private boolean callProcessor(
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
CharsRef stem = new CharsRef(word, offset, length);
int morphDataId = dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
return processor.processRoot(stem, forms.ints[forms.offset + i], morphDataId);
}
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
char circumfix = dictionary.circumfix;
// if the circumfix flag was set by a previously stripped prefix, this suffix must carry
// it too (and vice versa); otherwise the form isn't a valid root yet
if (isSuffix
&& isFlagAppendedByAffix(prefixId, circumfix) != isFlagAppendedByAffix(affix, circumfix)) {
return true;
}
if (isFlagAppendedByAffix(affix, dictionary.needaffix)) {
return !isSuffix
|| previousAffix < 0
|| isFlagAppendedByAffix(previousAffix, dictionary.needaffix);
}
return false;
}
private boolean isFlagAppendedByAffix(int affixId, char flag) {
if (affixId < 0 || flag == Dictionary.FLAG_UNSET) return false;
int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
return dictionary.hasFlag(appendId, flag);
}
}