// lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.hunspell;

 import static org.apache.lucene.analysis.hunspell.AffixKind.*;

 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.stream.Collectors;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
 import org.apache.lucene.util.fst.Util;

 /** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
 public class Dictionary {
   // Size, in bytes, of the *.aff prologue that is buffered up front and scanned for the SET and
   // FLAG directives before the file is parsed for real.
   // Derived from woorm/LibreOffice dictionaries.
   // See TestAllDictionaries.testMaxPrologueNeeded.
   static final int MAX_PROLOGUE_SCAN_WINDOW = 30 * 1024;

   // Shared empty flag array, used for affix rules that declare no continuation flags.
   static final char[] NOFLAGS = new char[0];

   // The zero char, i.e. the value a char flag field holds when nothing was assigned to it.
   static final char FLAG_UNSET = (char) 0;
   // NOTE(review): not referenced in this part of the file; looks like the start of a reserved
   // flag range just below HIDDEN_FLAG -- confirm against its usages.
   private static final int DEFAULT_FLAGS = 65510;
   static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell

   // Charset used to read the prologue when the affix file has no UTF-8 BOM, and the charset of
   // the initial decoder below.
   static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
   // Decoder for the affix and dictionary contents; replaced when the prologue has a SET directive.
   CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);

   // Affix lookup structures built by readAffixFile: keys are affix strings (suffixes are stored
   // reversed), outputs are the indices of the matching rules in affixData.
   FST<IntsRef> prefixes;
   FST<IntsRef> suffixes;
   // Word-break rules parsed from the BREAK directive; Breaks.DEFAULT when it is absent.
   Breaks breaks = Breaks.DEFAULT;

   /**
    * All condition checks used by prefixes and suffixes. These are typically re-used across many
    * affix stripping rules, so they are deduplicated to save RAM. Index 0 is a {@code null}
    * placeholder standing for the always-true condition.
    */
   ArrayList<AffixCondition> patterns = new ArrayList<>();

   /**
    * The entries in the .dic file, mapping to their set of flags. The FST output is the ordinal
    * list for flagLookup.
    */
   FST<IntsRef> words;

   /** A Bloom filter over {@link #words} to avoid unnecessary expensive FST traversals */
   FixedBitSet wordHashes;

   /**
    * The list of unique flagsets (wordforms). Theoretically huge, but practically small (for Polish
    * this is 756), otherwise humans wouldn't be able to deal with it either.
    */
   final FlagEnumerator.Lookup flagLookup;

   // The list of unique strip affixes, concatenated into one array; strip number i occupies
   // stripData[stripOffsets[i] .. stripOffsets[i + 1]).
   char[] stripData;
   int[] stripOffsets;

   // Value of the WORDCHARS directive, empty when the directive is absent.
   String wordChars = "";

   // 4 chars per affix, each char representing an unsigned 2-byte integer
   char[] affixData = new char[32];
   // Index of the next affix rule to be written into affixData.
   private int currentAffix = 0;

   // offsets in affixData
   static final int AFFIX_FLAG = 0;
   static final int AFFIX_STRIP_ORD = 1;
   // The condition slot holds (pattern index << 1) | cross-product bit.
   private static final int AFFIX_CONDITION = 2;
   static final int AFFIX_APPEND = 3;

   // Default flag parsing strategy
   FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();

   // AF entries
   private String[] aliases;
   private int aliasCount = 0;

   // AM entries
   private String[] morphAliases;
   private int morphAliasCount = 0;

   final List<String> morphData = new ArrayList<>(Collections.singletonList("")); // empty data at 0

   /**
    * We set this during sorting, so we know to add an extra int (index in {@link #morphData}) to
    * the FST output.
    */
   boolean hasCustomMorphData;

   // Constructor argument; NOTE(review): presumably enables case-insensitive matching -- its use
   // is outside this part of the file.
   boolean ignoreCase;
   // Set by the CHECKSHARPS directive.
   boolean checkSharpS;
   // Set by the COMPLEXPREFIXES directive: 2-stage prefix + 1-stage suffix stripping.
   boolean complexPrefixes;

   /**
    * All flags used in affix continuation classes. If an outer affix's flag isn't here, there's no
    * need to do 2-level affix stripping with it.
    */
   private char[] secondStagePrefixFlags, secondStageSuffixFlags;

   // Flags declared by the like-named affix file directives (CIRCUMFIX, KEEPCASE, FORCEUCASE,
   // NEEDAFFIX/PSEUDOROOT, FORBIDDENWORD, ONLYINCOMPOUND, COMPOUNDBEGIN, ...). They stay at the
   // zero char when the directive is absent.
   char circumfix;
   char keepcase, forceUCase;
   char needaffix;
   char forbiddenword;
   char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundFlag;
   char compoundPermit, compoundForbid;
   // Switches set by the like-named CHECKCOMPOUND* / SIMPLIFIEDTRIPLE directives.
   boolean checkCompoundCase, checkCompoundDup, checkCompoundRep;
   boolean checkCompoundTriple, simplifiedTriple;
   // COMPOUNDMIN and COMPOUNDWORDMAX values; parsed values below 1 are raised to 1.
   int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
   List<CompoundRule> compoundRules; // nullable
   List<CheckCompoundPattern> checkCompoundPatterns = new ArrayList<>();

   // ignored characters (dictionary, affix, inputs); kept sorted
   private char[] ignore;

   // Suggestion-related settings, filled from TRY, KEY, NOSPLITSUGS, REP, MAP, MAXDIFF,
   // MAXNGRAMSUGS, ONLYMAXDIFF, NOSUGGEST, SUBSTANDARD, ICONV and OCONV.
   String tryChars = "";
   String[] neighborKeyGroups = {"qwertyuiop", "asdfghjkl", "zxcvbnm"};
   boolean enableSplitSuggestions = true;
   List<RepEntry> repTable = new ArrayList<>();
   List<List<String>> mapTable = new ArrayList<>();
   int maxDiff = 5;
   int maxNGramSuggestions = 4;
   boolean onlyMaxDiff;
   char noSuggest, subStandard;
   ConvTable iconv, oconv;

   // true if we can strip suffixes "down to nothing"
   boolean fullStrip;

   // language declaration of the dictionary
   String language;
   // true if case algorithms should use alternate (Turkish/Azeri) mapping
   private boolean alternateCasing;

   /**
    * Creates a new Dictionary containing the information read from the provided InputStreams to
    * hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
    *
    * <p>Delegates to the multi-dictionary constructor with a single dictionary stream and {@code
    * ignoreCase} set to {@code false}.
    *
    * @param tempDir Directory to use for offline sorting
    * @param tempFileNamePrefix prefix to use to generate temp file names
    * @param affix InputStream for reading the hunspell affix file (won't be closed).
    * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
    * @throws IOException Can be thrown while reading from the InputStreams
    * @throws ParseException Can be thrown if the content of the files does not meet expected formats
    */
   public Dictionary(
       Directory tempDir, String tempFileNamePrefix, InputStream affix, InputStream dictionary)
       throws IOException, ParseException {
     this(tempDir, tempFileNamePrefix, affix, Collections.singletonList(dictionary), false);
   }

   /**
    * Creates a new Dictionary containing the information read from the provided InputStreams to
    * hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
    *
    * <p>The affix file is read in two passes: the first scans a buffered prologue for the SET and
    * FLAG directives, the second parses all directives with the resulting decoder and flag
    * strategy. The dictionaries are then merged, sorted offline and compiled into the word FST.
    *
    * @param tempDir Directory to use for offline sorting
    * @param tempFileNamePrefix prefix to use to generate temp file names
    * @param affix InputStream for reading the hunspell affix file (won't be closed).
    * @param dictionaries InputStreams for reading the hunspell dictionary files (won't be closed).
    * @param ignoreCase stored in the {@code ignoreCase} field before any parsing starts; presumably
    *     enables case-insensitive matching (its use is outside this constructor)
    * @throws IOException Can be thrown while reading from the InputStreams
    * @throws ParseException Can be thrown if the content of the files does not meet expected formats
    */
   public Dictionary(
       Directory tempDir,
       String tempFileNamePrefix,
       InputStream affix,
       List<InputStream> dictionaries,
       boolean ignoreCase)
       throws IOException, ParseException {
     this.ignoreCase = ignoreCase;

     try (BufferedInputStream affixStream =
         new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
           @Override
           public void close() {
             // TODO: maybe we should consume and close it? Why does it need to stay open?
             // Don't close the affix stream as per javadoc.
           }
         }) {
       // I assume we don't support other BOMs (utf16, etc.)? We trivially could,
       // by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have
       // any such exotic examples.
       Charset streamCharset;
       if (maybeConsume(affixStream, BOM_UTF8)) {
         streamCharset = StandardCharsets.UTF_8;
       } else {
         streamCharset = DEFAULT_CHARSET;
       }

       /*
        * pass 1: look for encoding & flag. This is simple but works. We just prefetch
        * a large enough chunk of the input and scan through it. The buffered data will
        * be subsequently reused anyway so nothing is wasted.
        */
       affixStream.mark(MAX_PROLOGUE_SCAN_WINDOW);
       byte[] prologue = affixStream.readNBytes(MAX_PROLOGUE_SCAN_WINDOW - 1);
       affixStream.reset();
       readConfig(new ByteArrayInputStream(prologue), streamCharset);

       // pass 2: parse affixes
       FlagEnumerator flagEnumerator = new FlagEnumerator();
       readAffixFile(affixStream, decoder, flagEnumerator);

       // read dictionary entries
       IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
       int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
       // Bloom filter size: the largest power of two not above ten bits per word. Computed in
       // longs and clamped to [1, 2^30] so that an empty word list does not produce a zero-length
       // bit set (lookupWord takes the hash modulo the length) and a huge one cannot overflow int.
       long bloomBits = Long.highestOneBit(Math.max(1L, wordCount * 10L));
       wordHashes = new FixedBitSet((int) Math.min(bloomBits, 1L << 30));
       String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
       words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator);
       flagLookup = flagEnumerator.finish();
       aliases = null; // no longer needed
       morphAliases = null; // no longer needed
     }
   }

   /**
    * Returns the number of ints a single word form occupies in the words FST output: two when
    * custom morphological data is present, one otherwise.
    */
   int formStep() {
     if (hasCustomMorphData) {
       return 2;
     }
     return 1;
   }

   /**
    * Looks up Hunspell word forms from the dictionary. The Bloom filter in {@code wordHashes} is
    * consulted first; the FST is only traversed when the filter reports a possible hit.
    *
    * @return the FST output for the word, or {@code null} if the word is not in the dictionary
    */
   IntsRef lookupWord(char[] word, int offset, int length) {
     int hash = CharsRef.stringHashCode(word, offset, length);
     int bit = Math.abs(hash) % wordHashes.length();
     boolean possiblyPresent = wordHashes.get(bit);
     return possiblyPresent ? lookup(words, word, offset, length) : null;
   }

   // only for testing: looks the whole array up in the prefix FST
   IntsRef lookupPrefix(char[] word) {
     final int start = 0;
     return lookup(prefixes, word, start, word.length);
   }

   // only for testing: looks the whole array up in the suffix FST
   IntsRef lookupSuffix(char[] word) {
     final int start = 0;
     return lookup(suffixes, word, start, word.length);
   }

   /**
    * Walks the given FST along the code points of {@code word[offset .. offset + length)} and
    * returns the accumulated output of the complete match.
    *
    * @return the output, or {@code null} if {@code fst} is {@code null} or has no such entry
    */
   IntsRef lookup(FST<IntsRef> fst, char[] word, int offset, int length) {
     if (fst == null) {
       return null;
     }
     final FST.BytesReader bytesReader = fst.getBytesReader();
     final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
     // Accumulate output as we go
     IntsRef output = fst.outputs.getNoOutput();

     final int end = offset + length;
     int pos = offset;
     while (pos < end) {
       int codePoint = Character.codePointAt(word, pos, end);
       output = nextArc(fst, arc, bytesReader, output, codePoint);
       if (output == null) {
         return null;
       }
       pos += Character.charCount(codePoint);
     }
     // the entry only counts if the FST accepts the input at this point
     return nextArc(fst, arc, bytesReader, output, FST.END_LABEL);
   }

   /**
    * Advances {@code arc} in place along the label {@code ch} and returns {@code output} extended
    * with the output of the arc that was followed.
    *
    * @return the extended output, or {@code null} if there is no arc with that label
    */
   static IntsRef nextArc(
       FST<IntsRef> fst, FST.Arc<IntsRef> arc, FST.BytesReader reader, IntsRef output, int ch) {
     FST.Arc<IntsRef> target;
     try {
       target = fst.findTargetArc(ch, arc, arc, reader);
     } catch (IOException bogus) {
       throw new RuntimeException(bogus);
     }
     if (target == null) {
       return null;
     }
     return fst.outputs.add(output, arc.output());
   }

   /**
    * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
    * and filling the option fields of this dictionary from the directives it finds. Lines whose
    * first word is not a handled directive are skipped.
    *
    * @param affixStream InputStream to read the content of the affix file from
    * @param decoder CharsetDecoder to decode the content of the file
    * @param flags enumerator that receives the continuation flag sets of the parsed affix rules
    * @throws IOException Can be thrown while reading from the InputStream
    * @throws ParseException if a directive is malformed, or SET/FLAG disagree with the values
    *     detected in the prologue
    */
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, FlagEnumerator flags)
       throws IOException, ParseException {
     // affix string -> indices of the rules having that affix; suffix keys are stored reversed
     TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
     TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
     // flags seen in continuation classes of prefix / suffix rules
     Set<Character> prefixContFlags = new HashSet<>();
     Set<Character> suffixContFlags = new HashSet<>();
     // condition key -> index into patterns, for deduplication
     Map<String, Integer> seenPatterns = new HashMap<>();

     // zero condition -> 0 ord
     seenPatterns.put(AffixCondition.ALWAYS_TRUE_KEY, 0);
     patterns.add(null);

     // zero strip -> 0 ord
     // (insertion-ordered, so iteration order below matches the assigned ordinals)
     Map<String, Integer> seenStrips = new LinkedHashMap<>();
     seenStrips.put("", 0);

     LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
     String line;
     while ((line = reader.readLine()) != null) {
       // ignore any BOM marker on first line
       if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
         line = line.substring(1);
       }
       line = line.trim();
       if (line.isEmpty()) continue;

       // the directive name; multi-line directives consume their follow-up lines from the reader
       String firstWord = line.split("\\s")[0];
       // TODO: convert to a switch?
       if ("AF".equals(firstWord)) {
         parseAlias(line);
       } else if ("AM".equals(firstWord)) {
         parseMorphAlias(line);
       } else if ("PFX".equals(firstWord)) {
         parseAffix(
             prefixes, prefixContFlags, line, reader, PREFIX, seenPatterns, seenStrips, flags);
       } else if ("SFX".equals(firstWord)) {
         parseAffix(
             suffixes, suffixContFlags, line, reader, SUFFIX, seenPatterns, seenStrips, flags);
       } else if (line.equals("COMPLEXPREFIXES")) {
         // note: matches the whole line, so a trailing comment disables this directive
         complexPrefixes =
             true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
       } else if ("CIRCUMFIX".equals(firstWord)) {
         circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("KEEPCASE".equals(firstWord)) {
         keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("FORCEUCASE".equals(firstWord)) {
         forceUCase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) {
         needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("ONLYINCOMPOUND".equals(firstWord)) {
         onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("CHECKSHARPS".equals(firstWord)) {
         checkSharpS = true;
       } else if ("IGNORE".equals(firstWord)) {
         ignore = singleArgument(reader, line).toCharArray();
         Arrays.sort(ignore);
       } else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
         // header line carries the entry count; the entries follow on the next lines
         int num = parseNum(reader, line);
         ConvTable res = parseConversions(reader, num);
         if (line.startsWith("I")) {
           iconv = res;
         } else {
           oconv = res;
         }
       } else if ("FULLSTRIP".equals(firstWord)) {
         fullStrip = true;
       } else if ("LANG".equals(firstWord)) {
         language = singleArgument(reader, line);
         this.alternateCasing = hasLanguage("tr", "az");
       } else if ("BREAK".equals(firstWord)) {
         breaks = parseBreaks(reader, line);
       } else if ("WORDCHARS".equals(firstWord)) {
         wordChars = firstArgument(reader, line);
       } else if ("TRY".equals(firstWord)) {
         tryChars = firstArgument(reader, line);
       } else if ("REP".equals(firstWord)) {
         int count = parseNum(reader, line);
         for (int i = 0; i < count; i++) {
           String[] parts = splitBySpace(reader, reader.readLine(), 3, Integer.MAX_VALUE);
           repTable.add(new RepEntry(parts[1], parts[2]));
         }
       } else if ("MAP".equals(firstWord)) {
         int count = parseNum(reader, line);
         for (int i = 0; i < count; i++) {
           mapTable.add(parseMapEntry(reader, reader.readLine()));
         }
       } else if ("KEY".equals(firstWord)) {
         neighborKeyGroups = singleArgument(reader, line).split("\\|");
       } else if ("NOSPLITSUGS".equals(firstWord)) {
         enableSplitSuggestions = false;
       } else if ("MAXNGRAMSUGS".equals(firstWord)) {
         maxNGramSuggestions = Integer.parseInt(singleArgument(reader, line));
       } else if ("MAXDIFF".equals(firstWord)) {
         int i = Integer.parseInt(singleArgument(reader, line));
         if (i < 0 || i > 10) {
           throw new ParseException("MAXDIFF should be between 0 and 10", reader.getLineNumber());
         }
         maxDiff = i;
       } else if ("ONLYMAXDIFF".equals(firstWord)) {
         onlyMaxDiff = true;
       } else if ("FORBIDDENWORD".equals(firstWord)) {
         forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("NOSUGGEST".equals(firstWord)) {
         noSuggest = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("SUBSTANDARD".equals(firstWord)) {
         subStandard = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("COMPOUNDMIN".equals(firstWord)) {
         // values below 1 are raised to 1
         compoundMin = Math.max(1, parseNum(reader, line));
       } else if ("COMPOUNDWORDMAX".equals(firstWord)) {
         compoundMax = Math.max(1, parseNum(reader, line));
       } else if ("COMPOUNDRULE".equals(firstWord)) {
         compoundRules = parseCompoundRules(reader, parseNum(reader, line));
       } else if ("COMPOUNDFLAG".equals(firstWord)) {
         compoundFlag = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("COMPOUNDBEGIN".equals(firstWord)) {
         compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("COMPOUNDMIDDLE".equals(firstWord)) {
         compoundMiddle = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("COMPOUNDEND".equals(firstWord)) {
         compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("COMPOUNDPERMITFLAG".equals(firstWord)) {
         compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("COMPOUNDFORBIDFLAG".equals(firstWord)) {
         compoundForbid = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("CHECKCOMPOUNDCASE".equals(firstWord)) {
         checkCompoundCase = true;
       } else if ("CHECKCOMPOUNDDUP".equals(firstWord)) {
         checkCompoundDup = true;
       } else if ("CHECKCOMPOUNDREP".equals(firstWord)) {
         checkCompoundRep = true;
       } else if ("CHECKCOMPOUNDTRIPLE".equals(firstWord)) {
         checkCompoundTriple = true;
       } else if ("SIMPLIFIEDTRIPLE".equals(firstWord)) {
         simplifiedTriple = true;
       } else if ("CHECKCOMPOUNDPATTERN".equals(firstWord)) {
         int count = parseNum(reader, line);
         for (int i = 0; i < count; i++) {
           checkCompoundPatterns.add(
               new CheckCompoundPattern(reader.readLine(), flagParsingStrategy, this));
         }
       } else if ("SET".equals(firstWord)) {
         // the charset was already chosen from the prologue; only verify it did not change
         checkCriticalDirectiveSame(
             "SET", reader, decoder.charset(), getDecoder(singleArgument(reader, line)).charset());
       } else if ("FLAG".equals(firstWord)) {
         // same for the flag format: compare the strategy class with the one already in use
         FlagParsingStrategy strategy = getFlagParsingStrategy(line, decoder.charset());
         checkCriticalDirectiveSame(
             "FLAG", reader, flagParsingStrategy.getClass(), strategy.getClass());
       }
     }

     this.prefixes = affixFST(prefixes);
     this.suffixes = affixFST(suffixes);
     secondStagePrefixFlags = toSortedCharArray(prefixContFlags);
     secondStageSuffixFlags = toSortedCharArray(suffixContFlags);

     // Flatten all unique strips into a single char array; strip i occupies
     // stripData[stripOffsets[i] .. stripOffsets[i + 1]).
     int totalChars = 0;
     for (String strip : seenStrips.keySet()) {
       totalChars += strip.length();
     }
     stripData = new char[totalChars];
     stripOffsets = new int[seenStrips.size() + 1];
     int currentOffset = 0;
     int currentIndex = 0;
     for (String strip : seenStrips.keySet()) {
       stripOffsets[currentIndex++] = currentOffset;
       strip.getChars(0, strip.length(), stripData, currentOffset);
       currentOffset += strip.length();
     }
     assert currentIndex == seenStrips.size();
     // trailing sentinel: end offset of the last strip
     stripOffsets[currentIndex] = currentOffset;
   }

   /**
    * Verifies that a directive met while parsing the whole affix file carries the same value that
    * was detected in the prologue.
    *
    * @param directive directive name, used in the error message
    * @param reader supplies the line number for the error
    * @param expected the value already in effect
    * @param actual the value found on the current line
    * @throws ParseException if the two values are not equal
    */
   private void checkCriticalDirectiveSame(
       String directive, LineNumberReader reader, Object expected, Object actual)
       throws ParseException {
     if (expected.equals(actual)) {
       return;
     }
     String message =
         directive
             + " directive should occur at most once, and in the first "
             + MAX_PROLOGUE_SCAN_WINDOW
             + " bytes of the *.aff file";
     throw new ParseException(message, reader.getLineNumber());
   }

   /**
    * Parses one entry line of a MAP table into its alternatives. Each parenthesized group yields
    * one multi-character alternative; every other character yields an alternative of its own.
    * Characters outside the BMP are kept whole rather than being split into two surrogates.
    *
    * @param reader supplies the line number for error reporting
    * @param line the entry line; its first argument is the text that gets parsed
    * @return the alternatives in the order they appear
    * @throws ParseException if the line has no argument or a parenthesis is left unclosed
    */
   private List<String> parseMapEntry(LineNumberReader reader, String line) throws ParseException {
     String unparsed = firstArgument(reader, line);
     List<String> mapEntry = new ArrayList<>();
     int j = 0;
     while (j < unparsed.length()) {
       if (unparsed.charAt(j) == '(') {
         int closing = unparsed.indexOf(')', j);
         if (closing < 0) {
           throw new ParseException("Unclosed parenthesis: " + line, reader.getLineNumber());
         }

         mapEntry.add(unparsed.substring(j + 1, closing));
         j = closing + 1;
       } else {
         // advance by a whole code point so that surrogate pairs stay together
         int width = Character.charCount(unparsed.codePointAt(j));
         mapEntry.add(unparsed.substring(j, j + width));
         j += width;
       }
     }
     return mapEntry;
   }

   /**
    * Tells whether the language code of the declared dictionary language equals one of the given
    * codes.
    *
    * @return {@code false} when no language was declared or none of the codes matches
    */
   boolean hasLanguage(String... langCodes) {
     if (language == null) return false;
     String langCode = extractLanguageCode(language);
     return Arrays.asList(langCodes).contains(langCode);
   }

   /**
    * Finds the dictionary entries stored for exactly the given string.
    *
    * @param root a string to look up in the dictionary. No case conversion or affix removal is
    *     performed. To get the possible roots of any word, you may call {@link
    *     Hunspell#getRoots(String)}
    * @return the dictionary entries for the given root, or {@code null} if there's none
    */
   public DictEntries lookupEntries(String root) {
     char[] chars = root.toCharArray();
     IntsRef forms = lookupWord(chars, 0, root.length());
     if (forms == null) {
       return null;
     }

     return new DictEntries() {
       @Override
       public int size() {
         // each entry takes two ints when morphological data is stored, one otherwise
         int step = hasCustomMorphData ? 2 : 1;
         return forms.length / step;
       }

       @Override
       public String getMorphologicalData(int entryIndex) {
         if (hasCustomMorphData) {
           int morphIndex = forms.ints[forms.offset + entryIndex * 2 + 1];
           return morphData.get(morphIndex);
         }
         return "";
       }

       @Override
       public List<String> getMorphologicalValues(int entryIndex, String key) {
         assert key.length() == 3;
         assert key.charAt(2) == ':';

         String fields = getMorphologicalData(entryIndex);
         if (fields.isEmpty() || !fields.contains(key)) return Collections.emptyList();

         // keep the space-separated fields starting with the key, minus the 3-char key itself
         List<String> values = new ArrayList<>();
         for (String field : fields.split(" ")) {
           if (field.startsWith(key)) {
             values.add(field.substring(3));
           }
         }
         return values;
       }
     };
   }

   /**
    * Returns the part of the given code that precedes the first underscore, or the whole code if
    * it contains no underscore (e.g. {@code "en_US"} gives {@code "en"}).
    */
   static String extractLanguageCode(String isoCode) {
     int underscore = isoCode.indexOf('_');
     if (underscore < 0) {
       return isoCode;
     }
     return isoCode.substring(0, underscore);
   }

   /**
    * Parses the first argument of a directive line as a decimal integer.
    *
    * @param reader supplies the line number for error reporting
    * @param line the directive line; further arguments after the number are ignored
    * @return the parsed number
    * @throws ParseException if the line has no argument or the argument is not an integer
    */
   private int parseNum(LineNumberReader reader, String line) throws ParseException {
     String number = splitBySpace(reader, line, 2, Integer.MAX_VALUE)[1];
     try {
       return Integer.parseInt(number);
     } catch (NumberFormatException e) {
       // report malformed counts like any other syntax error, with the offending line number
       ParseException wrapped =
           new ParseException("Invalid number '" + number + "' in: " + line, reader.getLineNumber());
       wrapped.initCause(e);
       throw wrapped;
     }
   }

   /**
    * Returns the argument of a directive line that must consist of the directive name and exactly
    * one argument (anything after that has to start with {@code #}).
    *
    * @throws ParseException if the line does not have that shape
    */
   private String singleArgument(LineNumberReader reader, String line) throws ParseException {
     String[] parts = splitBySpace(reader, line, 2);
     return parts[1];
   }

   /**
    * Returns the first argument of a directive line, ignoring whatever follows it.
    *
    * @throws ParseException if the line has no argument at all
    */
   private String firstArgument(LineNumberReader reader, String line) throws ParseException {
     String[] parts = splitBySpace(reader, line, 2, Integer.MAX_VALUE);
     return parts[1];
   }

   /**
    * Splits a line on whitespace, requiring exactly {@code expectedParts} parts (a following part
    * starting with {@code #} is tolerated).
    *
    * @throws ParseException if the number of parts does not match
    */
   private String[] splitBySpace(LineNumberReader reader, String line, int expectedParts)
       throws ParseException {
     final int minParts = expectedParts;
     final int maxParts = expectedParts;
     return splitBySpace(reader, line, minParts, maxParts);
   }

   /**
    * Splits a line on runs of whitespace and checks the number of resulting parts.
    *
    * @param reader supplies the line number for error reporting
    * @param line the line to split; {@code null} (end of file reached while more lines were
    *     expected) is reported as a parse error
    * @param minParts minimal number of parts required
    * @param maxParts maximal number of parts allowed; more parts are accepted only if the first
    *     surplus part starts with {@code #}, i.e. begins a trailing comment
    * @return all parts of the line, including any trailing comment parts
    * @throws ParseException if the line is missing or has too few or too many parts
    */
   private String[] splitBySpace(LineNumberReader reader, String line, int minParts, int maxParts)
       throws ParseException {
     if (line == null) {
       // callers pass reader.readLine() directly, which yields null on a truncated file
       throw new ParseException("Unexpected end of file", reader.getLineNumber());
     }
     String[] parts = line.split("\\s+");
     if (parts.length < minParts
         || (parts.length > maxParts && !parts[maxParts].startsWith("#"))) {
       throw new ParseException("Invalid syntax: " + line, reader.getLineNumber());
     }
     return parts;
   }

   /**
    * Reads {@code num} lines from the reader and turns the single argument of each into a compound
    * rule.
    *
    * @throws IOException if reading a line fails
    * @throws ParseException if a line does not consist of a directive name and one argument
    */
   private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
       throws IOException, ParseException {
     List<CompoundRule> rules = new ArrayList<>();
     int remaining = num;
     while (remaining-- > 0) {
       String ruleLine = reader.readLine();
       rules.add(new CompoundRule(singleArgument(reader, ruleLine), this));
     }
     return rules;
   }

   /**
    * Parses a BREAK table: the header line carries the entry count and each following line one
    * break string. Strings starting with {@code ^} go (without the marker) to the starting set,
    * otherwise strings ending with {@code $} go (without the marker) to the ending set, and all
    * others to the middle set.
    *
    * @throws IOException if reading a line fails
    * @throws ParseException if the header or an entry line is malformed
    */
   private Breaks parseBreaks(LineNumberReader reader, String line)
       throws IOException, ParseException {
     Set<String> starting = new LinkedHashSet<>();
     Set<String> ending = new LinkedHashSet<>();
     Set<String> middle = new LinkedHashSet<>();
     for (int remaining = parseNum(reader, line); remaining > 0; remaining--) {
       String breakStr = singleArgument(reader, reader.readLine());
       Set<String> target;
       String text;
       if (breakStr.startsWith("^")) {
         target = starting;
         text = breakStr.substring(1);
       } else if (breakStr.endsWith("$")) {
         target = ending;
         text = breakStr.substring(0, breakStr.length() - 1);
       } else {
         target = middle;
         text = breakStr;
       }
       target.add(text);
     }
     return new Breaks(starting, ending, middle);
   }

   /**
    * Compiles the sorted affix map into an FST whose inputs are the code points of the affix
    * strings and whose outputs are the rule indices collected for each affix.
    *
    * @throws IOException can be thrown while building the FST
    */
   private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
     IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
     FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
     IntsRefBuilder scratch = new IntsRefBuilder();
     for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) {
       Util.toUTF32(entry.getKey(), scratch);
       List<Integer> entries = entry.getValue();
       int count = entries.size();
       IntsRef output = new IntsRef(count);
       for (int i = 0; i < count; i++) {
         output.ints[i] = entries.get(i);
       }
       output.length = count;
       fstCompiler.add(scratch.get(), output);
     }
     return fstCompiler.compile();
   }

   /**
    * Parses a specific affix rule putting the result into the provided affix map. The header line
    * declares the number of rule lines, which are then read from the reader. A header whose count
    * field is not a number is skipped silently.
    *
    * @param affixes Map where the result of the parsing will be put
    * @param secondStageFlags receives every flag found in a continuation class of the rules
    * @param header Header line of the affix rule
    * @param reader LineNumberReader to read the content of the rule from
    * @param kind whether prefix or suffix rules are parsed; suffix keys are stored reversed
    * @param seenPatterns map from condition -&gt; index of patterns, for deduplication.
    * @param seenStrips map from strip string -&gt; its ordinal, for deduplication.
    * @param flags enumerator that receives the continuation flag set of each rule
    * @throws IOException Can be thrown while reading the rule
    * @throws ParseException if the header has fewer than four fields, the file ends before all
    *     declared rule lines were read, or a rule line is malformed
    */
   private void parseAffix(
       TreeMap<String, List<Integer>> affixes,
       Set<Character> secondStageFlags,
       String header,
       LineNumberReader reader,
       AffixKind kind,
       Map<String, Integer> seenPatterns,
       Map<String, Integer> seenStrips,
       FlagEnumerator flags)
       throws IOException, ParseException {

     StringBuilder sb = new StringBuilder();
     String[] args = header.split("\\s+");
     if (args.length < 4) {
       // header must be: PFX|SFX flag cross_product number
       throw new ParseException("Invalid syntax: " + header, reader.getLineNumber());
     }

     boolean crossProduct = args[2].equals("Y");

     int numLines;
     try {
       numLines = Integer.parseInt(args[3]);
     } catch (NumberFormatException e) {
       // not a header (e.g. a stray rule line beyond the declared count): skip it
       return;
     }
     affixData = ArrayUtil.grow(affixData, currentAffix * 4 + numLines * 4);

     for (int i = 0; i < numLines; i++) {
       String line = reader.readLine();
       if (line == null) {
         throw new ParseException(
             "Unexpected end of file: expected " + numLines + " rules after: " + header,
             reader.getLineNumber());
       }
       // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
       String[] ruleArgs = splitBySpace(reader, line, 4, Integer.MAX_VALUE);

       char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
       String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
       String affixArg = ruleArgs[3];
       char[] appendFlags = null;

       // first: parse continuation classes out of affix
       int flagSep = affixArg.lastIndexOf('/');
       if (flagSep != -1) {
         String flagPart = affixArg.substring(flagSep + 1);
         affixArg = affixArg.substring(0, flagSep);

         if (aliasCount > 0) {
           flagPart = getAliasValue(Integer.parseInt(flagPart));
         }

         appendFlags = flagParsingStrategy.parseFlags(flagPart);
         for (char appendFlag : appendFlags) {
           secondStageFlags.add(appendFlag);
         }
       }
       // zero affix -> empty string
       if ("0".equals(affixArg)) {
         affixArg = "";
       }

       String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
       String key = AffixCondition.uniqueKey(kind, strip, condition);

       // deduplicate patterns
       Integer patternIndex = seenPatterns.get(key);
       if (patternIndex == null) {
         patternIndex = patterns.size();
         if (patternIndex > Short.MAX_VALUE) {
           throw new UnsupportedOperationException(
               "Too many patterns, please report this to dev@lucene.apache.org");
         }
         seenPatterns.put(key, patternIndex);
         patterns.add(AffixCondition.compile(kind, strip, condition, line));
       }

       Integer stripOrd = seenStrips.get(strip);
       if (stripOrd == null) {
         stripOrd = seenStrips.size();
         seenStrips.put(strip, stripOrd);
         if (stripOrd > Character.MAX_VALUE) {
           throw new UnsupportedOperationException(
               "Too many unique strips, please report this to dev@lucene.apache.org");
         }
       }

       if (appendFlags == null) {
         appendFlags = NOFLAGS;
       }

       int appendFlagsOrd = flags.add(appendFlags);
       if (appendFlagsOrd < 0) {
         // already exists in our hash
         appendFlagsOrd = (-appendFlagsOrd) - 1;
       } else if (appendFlagsOrd > Short.MAX_VALUE) {
         // this limit is probably flexible, but it's a good sanity check too
         throw new UnsupportedOperationException(
             "Too many unique append flags, please report this to dev@lucene.apache.org");
       }

       // write the 4-char record of this rule
       int dataStart = currentAffix * 4;
       affixData[dataStart + AFFIX_FLAG] = flag;
       affixData[dataStart + AFFIX_STRIP_ORD] = (char) stripOrd.intValue();
       // encode crossProduct into patternIndex
       int patternOrd = patternIndex << 1 | (crossProduct ? 1 : 0);
       affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd;
       affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd;

       if (needsInputCleaning(affixArg)) {
         affixArg = cleanInput(affixArg, sb).toString();
       }

       if (kind == SUFFIX) {
         // suffixes are keyed back to front
         affixArg = new StringBuilder(affixArg).reverse().toString();
       }

       affixes.computeIfAbsent(affixArg, __ -> new ArrayList<>()).add(currentAffix);
       currentAffix++;
     }
   }

   /**
    * Reads one of the four chars stored for an affix rule.
    *
    * @param affixIndex index of the rule
    * @param offset position within the rule's 4-char record (one of the AFFIX_* constants)
    */
   char affixData(int affixIndex, int offset) {
     int slot = affixIndex * 4 + offset;
     return affixData[slot];
   }

   /** Tells whether the cross-product bit (the lowest bit of the condition slot) is set. */
   boolean isCrossProduct(int affix) {
     int encoded = affixData(affix, AFFIX_CONDITION);
     return (encoded & 1) != 0;
   }

   /** Returns the pattern index of the affix, i.e. its condition slot without the lowest bit. */
   int getAffixCondition(int affix) {
     char encoded = affixData(affix, AFFIX_CONDITION);
     return encoded >>> 1;
   }

   /**
    * Reads {@code num} conversion lines, each made of a directive name, a source and a
    * replacement, and builds a conversion table from them.
    *
    * @throws IOException if reading a line fails
    * @throws ParseException if a line does not have exactly three parts
    * @throws IllegalStateException if the same source is mapped twice
    */
   private ConvTable parseConversions(LineNumberReader reader, int num)
       throws IOException, ParseException {
     TreeMap<String, String> mappings = new TreeMap<>();

     int remaining = num;
     while (remaining-- > 0) {
       String[] parts = splitBySpace(reader, reader.readLine(), 3);
       String from = parts[1];
       String previous = mappings.put(from, parts[2]);
       if (previous != null) {
         throw new IllegalStateException("duplicate mapping specified for: " + from);
       }
     }

     return new ConvTable(mappings);
   }

   /** The three-byte sequence EF BB BF (the UTF-8 encoding of the byte order mark). */
   private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};

   /**
    * Scans the stream for the {@code SET} and {@code FLAG} lines and configures {@code decoder}
    * and {@code flagParsingStrategy} from them. Scanning stops as soon as both have been seen.
    *
    * @param stream input to scan
    * @param streamCharset charset used to decode the stream while scanning
    */
   private void readConfig(InputStream stream, Charset streamCharset)
       throws IOException, ParseException {
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(stream, streamCharset));
     // The FLAG line is only remembered here: interpreting it needs the decoder's charset,
     // and SET may come after FLAG.
     String flagLine = null;
     boolean charsetSeen = false;
     String line;
     while ((line = reader.readLine()) != null) {
       if (line.isBlank()) {
         continue;
       }

       switch (line.split("\\s")[0]) {
         case "SET":
           decoder = getDecoder(singleArgument(reader, line));
           charsetSeen = true;
           break;
         case "FLAG":
           flagLine = line;
           break;
         default:
           break;
       }

       if (charsetSeen && flagLine != null) {
         break;
       }
     }

     if (flagLine != null) {
       flagParsingStrategy = getFlagParsingStrategy(flagLine, decoder.charset());
     }
   }

   /**
    * Tries to read the given byte sequence from the current position of the stream. On a full
    * match the bytes stay consumed; on any mismatch (including end of stream) the stream is reset
    * to where it was.
    *
    * @param stream stream to read from
    * @param bytes expected byte sequence
    * @return {@code true} if the whole sequence matched and has been consumed
    */
   @SuppressWarnings("SameParameterValue")
   private static boolean maybeConsume(BufferedInputStream stream, byte[] bytes) throws IOException {
     stream.mark(bytes.length);
     int matched = 0;
     // read() yields -1 at end of stream, which never equals an unsigned byte value
     while (matched < bytes.length && stream.read() == (bytes[matched] & 0xff)) {
       matched++;
     }
     if (matched == bytes.length) {
       return true;
     }
     stream.reset();
     return false;
   }

   /**
    * Encoding names that {@code getDecoder} replaces with another name before calling
    * {@link Charset#forName(String)}. Keys are the names as looked up, values their replacements.
    */
   static final Map<String, String> CHARSET_ALIASES =
       Map.of("microsoft-cp1251", "windows-1251", "TIS620-2533", "TIS-620");

   /**
    * Retrieves the CharsetDecoder for the given encoding. {@code ISO8859-14} gets a dedicated
    * decoder; any other name is first translated through {@code CHARSET_ALIASES} and then resolved
    * with {@link Charset#forName(String)}.
    *
    * @param encoding Encoding to retrieve the CharsetDecoder for
    * @return CharsetDecoder for the given encoding
    */
   private CharsetDecoder getDecoder(String encoding) {
     if ("ISO8859-14".equals(encoding)) {
       return new ISO8859_14Decoder();
     }
     String charsetName = CHARSET_ALIASES.getOrDefault(encoding, encoding);
     Charset charset = Charset.forName(charsetName);
     return replacingDecoder(charset);
   }

   /**
    * Creates a new decoder for the charset whose malformed-input action is
    * {@link CodingErrorAction#REPLACE}.
    *
    * @param charset charset to create the decoder for
    * @return the configured decoder
    */
   private static CharsetDecoder replacingDecoder(Charset charset) {
     CharsetDecoder decoder = charset.newDecoder();
     decoder.onMalformedInput(CodingErrorAction.REPLACE);
     return decoder;
   }

   /**
    * Picks the {@link FlagParsingStrategy} named by the FLAG definition line of the affix file.
    * The line must consist of exactly two whitespace-separated parts; the second one selects the
    * strategy ({@code num}, {@code UTF-8} or {@code long}).
    *
    * @param flagLine Line containing the flag information
    * @param charset charset of the dictionary; for {@code UTF-8} flags it decides whether the
    *     flags have to be re-decoded from the default charset
    * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG
    *     definition
    * @throws IllegalArgumentException if the line is malformed or names an unknown flag type
    */
   static FlagParsingStrategy getFlagParsingStrategy(String flagLine, Charset charset) {
     String[] tokens = flagLine.split("\\s+");
     if (tokens.length != 2) {
       throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine);
     }

     String flagType = tokens[1];
     switch (flagType) {
       case "num":
         return new NumFlagParsingStrategy();
       case "UTF-8":
         return DEFAULT_CHARSET.equals(charset)
             ? new DefaultAsUtf8FlagParsingStrategy()
             : new SimpleFlagParsingStrategy();
       case "long":
         return new DoubleASCIIFlagParsingStrategy();
       default:
         throw new IllegalArgumentException("Unknown flag type: " + flagType);
     }
   }

   // Control characters used as field delimiters inside unescaped dictionary entries:
   // FLAG_SEPARATOR replaces the '/' in front of the flags, MORPH_SEPARATOR terminates the
   // word-and-flags part of an entry.
   private static final char FLAG_SEPARATOR = 0x1f; // flag separator after escaping
   private static final char MORPH_SEPARATOR =
       0x1e; // separator for boundary of entry (may be followed by morph data)

   /**
    * Converts a raw dictionary line into the internal separator-delimited form.
    *
    * <p>Up to the morphological boundary, a backslash makes the following character literal,
    * every other {@code '/'} not at position 0 becomes {@code FLAG_SEPARATOR}, and stray separator
    * characters are dropped. A {@code MORPH_SEPARATOR} is then appended, followed by the rest of
    * the line with stray separator characters removed.
    *
    * @param entry raw dictionary line
    * @return the unescaped entry
    */
   private String unescapeEntry(String entry) {
     int length = entry.length();
     int boundary = morphBoundary(entry);
     StringBuilder out = new StringBuilder();

     int pos = 0;
     while (pos < boundary) {
       char ch = entry.charAt(pos);
       if (ch == '\\' && pos + 1 < length) {
         // escaped character: copy it verbatim and step over both chars
         out.append(entry.charAt(pos + 1));
         pos += 2;
         continue;
       }
       if (ch == '/' && pos > 0) {
         out.append(FLAG_SEPARATOR);
       } else if (!shouldSkipEscapedChar(ch)) {
         out.append(ch);
       }
       pos++;
     }

     out.append(MORPH_SEPARATOR);
     for (int i = boundary; i < length; i++) {
       char ch = entry.charAt(i);
       if (!shouldSkipEscapedChar(ch)) {
         out.append(ch);
       }
     }
     return out.toString();
   }

   /**
    * Tells whether the character collides with one of the internal separators and so must not be
    * copied from dictionary input. (Some Zulu dictionaries embed binary data containing them.)
    */
   private static boolean shouldSkipEscapedChar(char ch) {
     switch (ch) {
       case FLAG_SEPARATOR:
       case MORPH_SEPARATOR:
         return true;
       default:
         return false;
     }
   }

   /**
    * Finds where the word-and-flags part of a dictionary line ends. The boundary is the first tab,
    * or the first space or tab not at position 0 that is directly followed by two letters and a
    * colon.
    *
    * @param line dictionary line
    * @return index of the boundary character, or {@code line.length()} if there is none
    */
   private static int morphBoundary(String line) {
     int length = line.length();
     for (int pos = indexOfSpaceOrTab(line, 0); pos >= 0; pos = indexOfSpaceOrTab(line, pos + 1)) {
       if (line.charAt(pos) == '\t') {
         return pos;
       }
       boolean morphFieldFollows =
           pos > 0
               && pos + 3 < length
               && Character.isLetter(line.charAt(pos + 1))
               && Character.isLetter(line.charAt(pos + 2))
               && line.charAt(pos + 3) == ':';
       if (morphFieldFollows) {
         return pos;
       }
     }
     return length;
   }

   /**
    * Finds the first space or tab at or after {@code start}.
    *
    * @param text text to search
    * @param start index to start searching from
    * @return index of the first space or tab, or -1 if the text contains neither from there on
    */
   static int indexOfSpaceOrTab(String text, int start) {
     int tab = text.indexOf('\t', start);
     int space = text.indexOf(' ', start);
     if (tab < 0) {
       return space;
     }
     if (space < 0) {
       return tab;
     }
     return Math.min(tab, space);
   }

   /**
    * Reads all dictionary streams, unescapes their entries and writes them, followed by a codec
    * footer, to {@code output}. As a side effect {@code hasCustomMorphData} is switched on when an
    * entry carries a morphological field other than {@code ph:}.
    *
    * @param dictionaries dictionary streams to merge
    * @param decoder decoder used to read every stream
    * @param output destination of the entries
    * @return the number of word entries written
    */
   private int mergeDictionaries(
       List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
       throws IOException {
     StringBuilder reuse = new StringBuilder();
     int written = 0;
     try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
       for (InputStream dictionary : dictionaries) {
         BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
         lines.readLine(); // first line is number of entries (approximately, sometimes)

         for (String raw = lines.readLine(); raw != null; raw = lines.readLine()) {
           // wild and unpredictable code comment rules
           if (raw.isEmpty()) {
             continue;
           }
           char first = raw.charAt(0);
           if (first == '#' || first == '\t') {
             continue;
           }

           String line = unescapeEntry(raw);
           // if we haven't seen any custom morphological data, try to parse one
           if (!hasCustomMorphData) {
             int morphStart = line.indexOf(MORPH_SEPARATOR);
             if (morphStart >= 0) {
               String data = line.substring(morphStart + 1);
               hasCustomMorphData =
                   splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
             }
           }

           written += writeNormalizedWordEntry(reuse, writer, line);
         }
       }
       CodecUtil.writeFooter(output);
     }
     return written;
   }

   /**
    * Writes one unescaped entry, with its word part cleaned when input cleaning applies. For
    * mixed-case words, and for upper-case words that carry flags, an additional hidden capitalized
    * entry is written as well.
    *
    * @param reuse scratch buffer
    * @param writer destination of the entries
    * @param line unescaped entry
    * @return the number of word entries written
    */
   private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
       throws IOException {
     int flagSep = line.indexOf(FLAG_SEPARATOR);
     int morphSep = line.indexOf(MORPH_SEPARATOR);
     assert morphSep > 0;
     assert morphSep > flagSep;

     int wordEnd = flagSep >= 0 ? flagSep : morphSep;
     if (wordEnd == 0) {
       return 0;
     }

     String word = line.substring(0, wordEnd);
     String written;
     if (needsInputCleaning(word)) {
       cleanInput(word, reuse);
       reuse.append(line, wordEnd, line.length());
       written = reuse.toString();
     } else {
       written = line;
     }

     // the part after the word is unchanged, so the new word end follows from the lengths
     int writtenWordEnd = written.length() - (line.length() - wordEnd);
     writer.write(written.getBytes(StandardCharsets.UTF_8));

     WordCase wordCase = WordCase.caseOf(written, writtenWordEnd);
     boolean needsHiddenEntry =
         wordCase == WordCase.MIXED || (wordCase == WordCase.UPPER && flagSep > 0);
     if (!needsHiddenEntry) {
       return 1;
     }
     addHiddenCapitalizedWord(
         reuse, writer, written.substring(0, writtenWordEnd), written.substring(writtenWordEnd));
     return 2;
   }

   /**
    * Writes a capitalized variant of the word (first char upper-cased, the rest case-folded)
    * marked with {@code HIDDEN_FLAG}, followed by the original flags and data.
    *
    * @param reuse scratch buffer; cleared and overwritten
    * @param writer destination of the entry
    * @param word the word part of the entry
    * @param afterSep everything after the word, starting with its separator
    */
   private void addHiddenCapitalizedWord(
       StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
       throws IOException {
     reuse.setLength(0);
     reuse.append(Character.toUpperCase(word.charAt(0)));
     int wordLength = word.length();
     int pos = 1;
     while (pos < wordLength) {
       reuse.append(caseFold(word.charAt(pos)));
       pos++;
     }

     // a flag separator is always written here, so skip the one afterSep may start with
     int tailStart = afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0;
     reuse.append(FLAG_SEPARATOR);
     reuse.append(HIDDEN_FLAG);
     reuse.append(afterSep, tailStart, afterSep.length());

     byte[] encoded = reuse.toString().getBytes(StandardCharsets.UTF_8);
     writer.write(encoded);
   }

   /**
    * Case-folds every char of the word with {@code caseFold}.
    *
    * @param word word to convert
    * @return the folded word
    */
   String toLowerCase(String word) {
     char[] folded = word.toCharArray();
     for (int i = 0; i < folded.length; i++) {
       folded[i] = caseFold(folded[i]);
     }
     return new String(folded);
   }

   /**
    * Upper-cases the first char of the word and case-folds the remaining ones with
    * {@code caseFold}.
    *
    * @param word word to convert; may be empty
    * @return the converted word, or the empty word unchanged
    */
   String toTitleCase(String word) {
     if (word.isEmpty()) {
       // nothing to capitalize; indexing char 0 below would throw
       return word;
     }
     char[] chars = new char[word.length()];
     chars[0] = Character.toUpperCase(word.charAt(0));
     for (int i = 1; i < word.length(); i++) {
       chars[i] = caseFold(word.charAt(i));
     }
     return new String(chars);
   }

   /**
    * Sorts the file of unsorted word entries with an {@link OfflineSorter} and deletes the
    * unsorted file afterwards, whether or not sorting succeeded.
    *
    * @param tempDir directory holding the unsorted file; also passed to the sorter
    * @param tempFileNamePrefix file name prefix passed to the sorter
    * @param unsorted the output whose file is to be sorted
    * @return the name of the sorted file as returned by the sorter
    */
   private String sortWordsOffline(
       Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
     OfflineSorter sorter =
         new OfflineSorter(
             tempDir,
             tempFileNamePrefix,
             new Comparator<>() {
               // views onto the compared rows, reused across compare() calls
               final BytesRef scratch1 = new BytesRef();
               final BytesRef scratch2 = new BytesRef();

               // Points scratch at the bytes of o, cut off at the last FLAG_SEPARATOR or
               // MORPH_SEPARATOR byte (the row is scanned from its end). No bytes are copied.
               private void initScratch(BytesRef o, BytesRef scratch) {
                 scratch.bytes = o.bytes;
                 scratch.offset = o.offset;
                 scratch.length = o.length;

                 for (int i = scratch.length - 1; i >= 0; i--) {
                   if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR
                       || scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) {
                     scratch.length = i;
                     break;
                   }
                 }
               }

               // Orders rows by their truncated prefix first and by the whole row second.
               @Override
               public int compare(BytesRef o1, BytesRef o2) {
                 initScratch(o1, scratch1);
                 initScratch(o2, scratch2);

                 int cmp = scratch1.compareTo(scratch2);
                 if (cmp == 0) {
                   // tie break on whole row
                   return o1.compareTo(o2);
                 } else {
                   return cmp;
                 }
               }
             });

     String sorted;
     boolean success = false;
     try {
       sorted = sorter.sort(unsorted.getName());
       success = true;
     } finally {
       // On success a failing delete is reported; after a failed sort the delete is best-effort
       // so that it cannot mask the original exception.
       if (success) {
         tempDir.deleteFile(unsorted.getName());
       } else {
         IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
       }
     }
     return sorted;
   }

   /**
    * Reads the sorted file of word entries back and builds the word FST from it.
    *
    * <p>Every record is split at {@code FLAG_SEPARATOR} and {@code MORPH_SEPARATOR} into the word,
    * its flags (looked up in the alias table when aliases are defined) and its morphological
    * data. Entries are handed to an {@link EntryGrouper}, and one bit derived from the word's hash
    * code is set in {@code wordHashes}. The sorted file is deleted before this method returns,
    * whether or not reading succeeded.
    *
    * @param tempDir directory holding the sorted file
    * @param sorted name of the sorted file inside {@code tempDir}
    * @param flags enumerator that receives the flag sets of the grouped entries
    * @return the compiled FST
    * @throws IOException if the file cannot be read or deleted
    */
   private FST<IntsRef> readSortedDictionaries(
       Directory tempDir, String sorted, FlagEnumerator flags) throws IOException {
     boolean success = false;

     Map<String, Integer> morphIndices = new HashMap<>();

     EntryGrouper grouper = new EntryGrouper(flags);

     try (ByteSequencesReader reader =
         new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {

       // TODO: the flags themselves can be double-chars (long) or also numeric
       // either way the trick is to encode them as char... but they must be parsed differently

       while (true) {
         BytesRef scratch = reader.next();
         if (scratch == null) {
           break;
         }

         String line = scratch.utf8ToString();
         String entry;
         char[] wordForm;
         int end = line.indexOf(MORPH_SEPARATOR);

         int flagSep = line.indexOf(FLAG_SEPARATOR);
         if (flagSep == -1) {
           wordForm = NOFLAGS;
           entry = line.substring(0, end);
         } else {
           // a hidden entry has HIDDEN_FLAG right after the separator, before the real flags
           boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
           String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end);
           if (aliasCount > 0 && !flagPart.isEmpty()) {
             flagPart = getAliasValue(Integer.parseInt(flagPart));
           }

           wordForm = flagParsingStrategy.parseFlags(flagPart);
           if (hidden) {
             wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
             wordForm[wordForm.length - 1] = HIDDEN_FLAG;
           }
           entry = line.substring(0, flagSep);
         }

         int morphDataID = 0;
         if (end + 1 < line.length()) {
           List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
           if (!morphFields.isEmpty()) {
             morphFields.sort(Comparator.naturalOrder());
             morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
           }
         }

         // Math.abs(Integer.MIN_VALUE) is still negative, so take the remainder before the
         // absolute value. For every other hash code this yields the same bit as
         // Math.abs(hash) % length.
         wordHashes.set(Math.abs(entry.hashCode() % wordHashes.length()));
         grouper.add(entry, wordForm, morphDataID);
       }

       // finalize last entry
       grouper.flushGroup();
       success = true;
       return grouper.words.compile();
     } finally {
       // After a failure the delete is best-effort so that it cannot mask the original exception.
       if (success) {
         tempDir.deleteFile(sorted);
       } else {
         IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
       }
     }
   }

   /**
    * Splits the morphological data of an entry into fields. {@code ph:} fields are turned into
    * replacement entries via {@code addPhoneticRepEntries}; all other fields are returned.
    *
    * @param word the word the data belongs to
    * @param unparsed raw morphological data
    * @return the fields other than {@code ph:}, possibly empty
    */
   private List<String> readMorphFields(String word, String unparsed) {
     List<String> kept = null;
     for (String field : splitMorphData(unparsed)) {
       if (field.startsWith("ph:")) {
         addPhoneticRepEntries(word, field.substring(3));
         continue;
       }
       // allocate lazily: most entries have no such fields
       if (kept == null) {
         kept = new ArrayList<>(1);
       }
       kept.add(field);
     }
     if (kept != null) {
       return kept;
     }
     return Collections.emptyList();
   }

   /**
    * Returns the index of the given morphological data string in {@code morphData}, appending the
    * string first if it has not been registered in {@code indices} yet.
    *
    * @param indices cache from data string to its index; updated on a miss
    * @param morphFields data string to look up
    * @return the index of the string in {@code morphData}
    */
   private int addMorphFields(Map<String, Integer> indices, String morphFields) {
     Integer known = indices.get(morphFields);
     if (known == null) {
       int created = morphData.size();
       indices.put(morphFields, created);
       morphData.add(morphFields);
       return created;
     }
     return known;
   }

   /**
    * Adds replacement entries to {@code repTable} for one {@code ph:} field of a word.
    *
    * @param word the dictionary word
    * @param ph the value of the {@code ph:} field, either {@code pattern} or
    *     {@code pattern->replacement}
    */
   private void addPhoneticRepEntries(String word, String ph) {
     // e.g. "pretty ph:prity ph:priti->pretti" to suggest both prity->pretty and pritier->prettiest
     int arrow = ph.indexOf("->");
     boolean hasExplicitReplacement = arrow > 0;
     String pattern = hasExplicitReplacement ? ph.substring(0, arrow) : ph;
     String replacement = hasExplicitReplacement ? ph.substring(arrow + 2) : word;

     // when the ph: field ends with *, strip last character of pattern and replacement
     // e.g., "pretty ph:prity*" results in "prit->prett" replacement instead of "prity->pretty",
     // to get both prity->pretty and pritiest->prettiest suggestions.
     boolean strippable = pattern.length() > 2 && replacement.length() > 1;
     if (strippable && pattern.endsWith("*")) {
       // drops the '*' together with the char in front of it
       pattern = pattern.substring(0, pattern.length() - 2);
       replacement = replacement.substring(0, replacement.length() - 1);
     }

     // capitalize lowercase pattern for capitalized words to support
     // good suggestions also for capitalized misspellings,
     // e.g. Wednesday ph:wendsay results in wendsay -> Wednesday and Wendsay -> Wednesday.
     boolean titleWordWithLowerPattern =
         WordCase.caseOf(word) == WordCase.TITLE && WordCase.caseOf(pattern) == WordCase.LOWER;
     if (titleWordWithLowerPattern) {
       // add also lowercase word in the case of German or
       // Hungarian to support lowercase suggestions lowercased by
       // compound word generation or derivational suffixes
       // for example by adjectival suffix "-i" of geographical names in Hungarian:
       // Massachusetts ph:messzecsuzec
       // messzecsuzeci -> massachusettsi (adjective)
       // For lowercasing by conditional PFX rules, see e.g. germancompounding test
       if (hasLanguage("de", "hu")) {
         repTable.add(new RepEntry(pattern, toLowerCase(replacement)));
       }
       repTable.add(new RepEntry(toTitleCase(pattern), replacement));
     }

     repTable.add(new RepEntry(pattern, replacement));
   }

   /**
    * Tells whether the word starts with a dotted capital I while {@code alternateCasing} is off.
    *
    * @param word the word; its first char is inspected
    */
   boolean isDotICaseChangeDisallowed(char[] word) {
     boolean startsWithDottedI = word[0] == 'İ';
     return startsWithDottedI && !alternateCasing;
   }

   /**
    * Collects consecutive entries of the same word and adds each finished group to the word FST
    * as one key with a sequence of ints as its output.
    */
   private class EntryGrouper {
     final FSTCompiler<IntsRef> words =
         new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
     private final FlagEnumerator flagEnumerator;
     private final List<char[]> group = new ArrayList<>();
     private final List<Integer> morphDataIDs = new ArrayList<>();
     private final IntsRefBuilder scratchInts = new IntsRefBuilder();
     private String currentEntry = null;

     EntryGrouper(FlagEnumerator flagEnumerator) {
       this.flagEnumerator = flagEnumerator;
     }

     /**
      * Adds one entry. When the word differs from the current one, the current group is flushed
      * first.
      *
      * @throws IllegalArgumentException if the new word sorts before the current one
      */
     void add(String entry, char[] flags, int morphDataID) throws IOException {
       if (currentEntry == null) {
         currentEntry = entry;
       } else if (!entry.equals(currentEntry)) {
         if (entry.compareTo(currentEntry) < 0) {
           throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
         }
         flushGroup();
         currentEntry = entry;
       }

       group.add(flags);
       if (hasCustomMorphData) {
         morphDataIDs.add(morphDataID);
       }
     }

     /**
      * Adds the current group to the FST and clears it. Hidden forms are left out as soon as the
      * group contains at least one form that is not hidden.
      */
     void flushGroup() throws IOException {
       boolean hasNonHidden = group.stream().anyMatch(flags -> !hasHiddenFlag(flags));

       IntsRefBuilder ords = new IntsRefBuilder();
       int index = 0;
       for (char[] flags : group) {
         boolean skipped = hasNonHidden && hasHiddenFlag(flags);
         if (!skipped) {
           ords.append(flagEnumerator.add(flags));
           if (hasCustomMorphData) {
             ords.append(morphDataIDs.get(index));
           }
         }
         index++;
       }

       Util.toUTF32(currentEntry, scratchInts);
       words.add(scratchInts.get(), ords.get());

       group.clear();
       morphDataIDs.clear();
     }
   }

   /** Tells whether the flag array contains {@code HIDDEN_FLAG}. */
   private static boolean hasHiddenFlag(char[] flags) {
     int i = 0;
     while (i < flags.length && flags[i] != HIDDEN_FLAG) {
       i++;
     }
     return i < flags.length;
   }

   /**
    * Handles one flag alias line. The first line seen allocates {@code aliases} with the count it
    * states; each later line stores its second token, or an empty string if the line has no
    * second token, as the next alias.
    *
    * @param line the alias line
    */
   private void parseAlias(String line) {
     String[] tokens = line.split("\\s+");
     if (aliases != null) {
       // an alias can map to no flags
       String value = tokens.length == 1 ? "" : tokens[1];
       aliases[aliasCount++] = value;
       return;
     }
     // first line should be the aliases count
     final int declared = Integer.parseInt(tokens[1]);
     aliases = new String[declared];
   }

   /**
    * Resolves a 1-based flag alias number to the alias value.
    *
    * @param id 1-based alias number
    * @return the alias stored for that number
    * @throws IllegalArgumentException if the number does not refer to an alias that has been
    *     defined, including slots that were declared by the alias count but never filled
    */
   private String getAliasValue(int id) {
     // Check against aliasCount rather than the array length: slots beyond aliasCount are
     // still null and would only fail later, far from the cause.
     if (id < 1 || id > aliasCount) {
       throw new IllegalArgumentException("Bad flag alias number:" + id);
     }
     return aliases[id - 1];
   }

   /**
    * Handles one morphological alias line. The first line seen allocates {@code morphAliases}
    * with the count it states; each later line stores everything after the first two chars of the
    * line as the next alias.
    *
    * @param line the alias line
    * @throws NumberFormatException if the count on the first line is not a number
    */
   private void parseMorphAlias(String line) {
     if (morphAliases == null) {
       // first line should be the aliases count; tolerate extra whitespace around it
       final int count = Integer.parseInt(line.substring(3).trim());
       morphAliases = new String[count];
     } else {
       String arg = line.substring(2); // leave the space
       morphAliases[morphAliasCount++] = arg;
     }
   }

   /**
    * Splits morphological data into its fields. If aliases are defined and the data is a number,
    * the alias with that 1-based number is used instead. Only tokens longer than three chars that
    * start with two letters and a colon are kept.
    *
    * @param morphData raw morphological data or an alias number
    * @return the recognized fields, possibly empty
    */
   private List<String> splitMorphData(String morphData) {
     // first see if it's an alias
     if (morphAliasCount > 0) {
       try {
         int alias = Integer.parseInt(morphData.trim());
         morphData = morphAliases[alias - 1];
       } catch (NumberFormatException ignored) {
         // not a number, so the data is taken literally
       }
     }
     if (morphData.isBlank()) {
       return Collections.emptyList();
     }

     List<String> fields = new ArrayList<>();
     for (String token : morphData.split("\\s+")) {
       boolean looksLikeField =
           token.length() > 3
               && Character.isLetter(token.charAt(0))
               && Character.isLetter(token.charAt(1))
               && token.charAt(2) == ':';
       if (looksLikeField) {
         fields.add(token);
       }
     }
     return fields;
   }

   /**
    * Tells whether any of the given forms has the flag. The forms are visited in steps of
    * {@code formStep()} ints.
    *
    * @param forms the forms to check
    * @param flag the flag to look for
    */
   boolean hasFlag(IntsRef forms, char flag) {
     int step = formStep();
     int pos = 0;
     while (pos < forms.length) {
       int entryId = forms.ints[forms.offset + pos];
       if (hasFlag(entryId, flag)) {
         return true;
       }
       pos += step;
     }
     return false;
   }

   /** Abstraction of the process of parsing flags taken from the affix and dic files */
   abstract static class FlagParsingStrategy {
     // we don't check the flag count, as Hunspell accepts longer sequences
     // https://github.com/hunspell/hunspell/issues/707
     static final boolean checkFlags = false;

     /**
      * Parses the given String into a single flag. If the String contains more than one flag, the
      * first one is returned.
      *
      * @param rawFlag String to parse into a flag
      * @return Parsed flag
      * @throws IllegalArgumentException if the String contains no flag at all
      */
     char parseFlag(String rawFlag) {
       char[] flags = parseFlags(rawFlag);
       if (flags.length == 0) {
         // fail with a message naming the input instead of an ArrayIndexOutOfBoundsException
         throw new IllegalArgumentException("expected a flag, got: " + rawFlag);
       }
       if (checkFlags && flags.length != 1) {
         throw new IllegalArgumentException("expected only one flag, got: " + rawFlag);
       }
       return flags[0];
     }

     /**
      * Parses the given String into multiple flags
      *
      * @param rawFlags String to parse into flags
      * @return Parsed flags
      */
     abstract char[] parseFlags(String rawFlags);
   }

   /**
    * {@link FlagParsingStrategy} in which every char of the String is one flag. Can be used with
    * both the ASCII and UTF-8 flag types.
    */
   private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
     @Override
     public char[] parseFlags(String rawFlags) {
       char[] flags = new char[rawFlags.length()];
       rawFlags.getChars(0, flags.length, flags, 0);
       return flags;
     }
   }

   /**
    * {@link FlagParsingStrategy} for flags that are UTF-8 even though the rest of the file was
    * read in the default (8-bit) encoding: the String is encoded back to bytes with the default
    * charset and those bytes are decoded as UTF-8.
    */
   private static class DefaultAsUtf8FlagParsingStrategy extends FlagParsingStrategy {
     @Override
     public char[] parseFlags(String rawFlags) {
       byte[] originalBytes = rawFlags.getBytes(DEFAULT_CHARSET);
       String redecoded = new String(originalBytes, StandardCharsets.UTF_8);
       return redecoded.toCharArray();
     }
   }

   /**
    * {@link FlagParsingStrategy} for flags written as decimal numbers separated by commas. Chars
    * that are neither ASCII digits nor commas are ignored, and so are empty numbers.
    */
   private static class NumFlagParsingStrategy extends FlagParsingStrategy {
     @Override
     public char[] parseFlags(String rawFlags) {
       StringBuilder parsed = new StringBuilder();
       StringBuilder digits = new StringBuilder();
       int length = rawFlags.length();
       for (int i = 0; i <= length; i++) {
         // a virtual comma past the end flushes the last number
         char ch = i < length ? rawFlags.charAt(i) : ',';
         if (ch == ',') {
           if (digits.length() == 0) {
             // ignoring empty flags (this happens in danish, for example)
             continue;
           }
           int flag = Integer.parseInt(digits, 0, digits.length(), 10);
           if (flag >= DEFAULT_FLAGS) {
             // accept 0 due to https://github.com/hunspell/hunspell/issues/708
             throw new IllegalArgumentException(
                 "Num flags should be between 0 and " + DEFAULT_FLAGS + ", found " + flag);
           }
           parsed.append((char) flag);
           digits.setLength(0);
         } else if (ch >= '0' && ch <= '9') {
           digits.append(ch);
         }
       }

       char[] flags = new char[parsed.length()];
       parsed.getChars(0, flags.length, flags, 0);
       return flags;
     }
   }

   /**
    * {@link FlagParsingStrategy} in which every flag consists of two chars below 256, combined
    * into one char with the first one in the high byte. A trailing unpaired char is ignored
    * unless {@code checkFlags} is on.
    */
   private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {

     @Override
     public char[] parseFlags(String rawFlags) {
       int rawLength = rawFlags.length();
       if (checkFlags && (rawLength & 1) != 0) {
         throw new IllegalArgumentException(
             "Invalid flags (should be even number of characters): " + rawFlags);
       }

       char[] flags = new char[rawLength / 2];
       for (int i = 0, pos = 0; i < flags.length; i++, pos += 2) {
         char high = rawFlags.charAt(pos);
         char low = rawFlags.charAt(pos + 1);
         if (high > 0xff || low > 0xff) {
           throw new IllegalArgumentException(
               "Invalid flags (LONG flags must be double ASCII): " + rawFlags);
         }
         flags[i] = (char) ((high << 8) | low);
       }
       return flags;
     }
   }

   /**
    * Asks {@code flagLookup} whether the entry with the given id has the given flag.
    *
    * @param entryId id of the entry to check
    * @param flag the flag to look for
    */
   boolean hasFlag(int entryId, char flag) {
     return flagLookup.hasFlag(entryId, flag);
   }

   /**
    * Tells whether any setting that can make input cleaning necessary is active: case-insensitive
    * matching, ignored chars or input conversions.
    */
   boolean mayNeedInputCleaning() {
     if (ignoreCase) {
       return true;
     }
     return ignore != null || iconv != null;
   }

   /**
    * Tells whether the input contains a char that {@code cleanInput} could change: an ignored
    * char, a char changed by case folding when case is ignored, or a char the input conversions
    * might replace.
    *
    * @param input the text to check
    */
   boolean needsInputCleaning(CharSequence input) {
     if (!mayNeedInputCleaning()) {
       return false;
     }
     int length = input.length();
     for (int i = 0; i < length; i++) {
       char ch = input.charAt(i);
       if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
         return true;
       }
       if (ignoreCase && caseFold(ch) != ch) {
         return true;
       }
       if (iconv != null && iconv.mightReplaceChar(ch)) {
         return true;
       }
     }
     return false;
   }

   /**
    * Writes the cleaned form of the input into {@code reuse}: ignored chars are removed, input
    * conversions are applied and, when case is ignored, the result is case-folded.
    *
    * @param input the text to clean
    * @param reuse buffer that receives the result; its previous content is discarded
    * @return {@code reuse}
    */
   CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
     reuse.setLength(0);

     // if we have no input conversion mappings, fold case on-the-fly
     boolean foldWhileCopying = ignoreCase && iconv == null;
     int length = input.length();
     for (int i = 0; i < length; i++) {
       char ch = input.charAt(i);
       if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
         continue;
       }
       reuse.append(foldWhileCopying ? caseFold(ch) : ch);
     }

     if (iconv == null) {
       return reuse;
     }

     // with conversions, folding has to wait until the mappings have been applied
     iconv.applyMappings(reuse);
     if (ignoreCase) {
       for (int i = 0; i < reuse.length(); i++) {
         reuse.setCharAt(i, caseFold(reuse.charAt(i)));
       }
     }
     return reuse;
   }

   /**
    * Copies the chars of the set into a new array sorted in ascending order.
    *
    * @param set the chars to copy
    * @return the sorted array
    */
   private static char[] toSortedCharArray(Set<Character> set) {
     char[] sorted = new char[set.size()];
     int filled = 0;
     for (char c : set) {
       sorted[filled] = c;
       filled++;
     }
     Arrays.sort(sorted);
     return sorted;
   }

   /** Tells whether the flag is found in {@code secondStagePrefixFlags} by binary search. */
   boolean isSecondStagePrefix(char flag) {
     int position = Arrays.binarySearch(secondStagePrefixFlags, flag);
     return position >= 0;
   }

   /** Tells whether the flag is found in {@code secondStageSuffixFlags} by binary search. */
   boolean isSecondStageSuffix(char flag) {
     int position = Arrays.binarySearch(secondStageSuffixFlags, flag);
     return position >= 0;
   }

   /**
    * Folds a single character to lower case. With {@code alternateCasing} on, {@code 'I'} folds to
    * dotless {@code 'ı'} and dotted {@code 'İ'} folds to {@code 'i'}.
    */
   char caseFold(char c) {
     if (alternateCasing) {
       switch (c) {
         case 'I':
           return 'ı';
         case 'İ':
           return 'i';
         default:
           break;
       }
     }
     return Character.toLowerCase(c);
   }

   /**
    * Returns true if this dictionary was constructed with the {@code ignoreCase} option.
    *
    * @return the value of the {@code ignoreCase} field
    */
   public boolean getIgnoreCase() {
     return ignoreCase;
   }

   /**
    * Returns the directory named by the {@code java.io.tmpdir} system property.
    *
    * @return the temporary directory
    * @throws IOException if the property is not set or the directory is not writable
    */
   static Path getDefaultTempDir() throws IOException {
     String configured = System.getProperty("java.io.tmpdir");
     if (configured == null) {
       throw new IOException("No temporary path (java.io.tmpdir)?");
     }
     Path directory = Paths.get(configured);
     if (Files.isWritable(directory)) {
       return directory;
     }
     throw new IOException(
         "Temporary path not present or writeable?: " + directory.toAbsolutePath());
   }

   /**
    * Possible word breaks according to BREAK directives, kept as three arrays: breaks at the
    * start of a word, at its end and in the middle.
    */
   static class Breaks {
     private static final Set<String> MINUS = Collections.singleton("-");
     static final Breaks DEFAULT = new Breaks(MINUS, MINUS, MINUS);
     final String[] starting, ending, middle;

     Breaks(Collection<String> starting, Collection<String> ending, Collection<String> middle) {
       this.starting = asArray(starting);
       this.ending = asArray(ending);
       this.middle = asArray(middle);
     }

     private static String[] asArray(Collection<String> breaks) {
       return breaks.toArray(new String[0]);
     }

     /** Tells whether at least one break is defined. */
     boolean isNotEmpty() {
       return starting.length + ending.length + middle.length > 0;
     }
   }
 }