lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.hunspell;


 import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.OutputStream;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.TreeMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.CharSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
 import org.apache.lucene.util.fst.Outputs;
 import org.apache.lucene.util.fst.Util;

 /**
  * In-memory structure for the dictionary (.dic) and affix (.aff)
  * data of a hunspell dictionary.
  */
 public class Dictionary {

   // shared empty flag array; parseAffix uses it when a rule carries no continuation flags
   static final char[] NOFLAGS = new char[0];

   // option keywords that readAffixFile recognizes at the start of a line of the affix (.aff) file
   private static final String ALIAS_KEY = "AF";
   private static final String MORPH_ALIAS_KEY = "AM";
   private static final String PREFIX_KEY = "PFX";
   private static final String SUFFIX_KEY = "SFX";
   private static final String FLAG_KEY = "FLAG";
   private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
   private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
   private static final String IGNORE_KEY = "IGNORE";
   private static final String ICONV_KEY = "ICONV";
   private static final String OCONV_KEY = "OCONV";
   private static final String FULLSTRIP_KEY = "FULLSTRIP";
   private static final String LANG_KEY = "LANG";
   private static final String KEEPCASE_KEY = "KEEPCASE";
   private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
   private static final String PSEUDOROOT_KEY = "PSEUDOROOT"; // handled exactly like NEEDAFFIX
   private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND";

   // values accepted after the FLAG keyword (see getFlagParsingStrategy)
   private static final String NUM_FLAG_TYPE = "num";
   private static final String UTF8_FLAG_TYPE = "UTF-8";
   private static final String LONG_FLAG_TYPE = "long";

   // String.format templates that turn an affix condition (%s) into a regex over the whole word:
   // a prefix condition must match at the start, a suffix condition at the end.
   // TODO: really for suffixes we should reverse the automaton and run them backwards
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

   // affix text -> list of affix ordinals (indexes into affixData) that use that text.
   // suffix keys are stored reversed (see parseAffix).
   FST<IntsRef> prefixes;
   FST<IntsRef> suffixes;

   // all condition checks used by prefixes and suffixes. these are typically re-used across
   // many affix stripping rules. so these are deduplicated, to save RAM.
   // index 0 is reserved for the "zero condition" (.*) and holds null.
   ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>();

   // the entries in the .dic file, mapping to their set of flags.
   // the fst output is the ordinal list for flagLookup
   FST<IntsRef> words;
   // the list of unique flagsets (wordforms). theoretically huge, but practically
   // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
   // ord 0 is the empty flag set (added in the constructor).
   BytesRefHash flagLookup = new BytesRefHash();

   // the list of unique strip affixes.
   // strip number i occupies stripData[stripOffsets[i] .. stripOffsets[i+1]); strip 0 is the empty string.
   char[] stripData;
   int[] stripOffsets;

   // 8 bytes per affix
   // written by parseAffix as four shorts: flag, strip ordinal,
   // (pattern ordinal << 1 | crossProduct bit), append-flags ordinal
   byte[] affixData = new byte[64];
   // number of affix rules written so far, i.e. the ordinal the next rule will get
   private int currentAffix = 0;

   private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy

   // AF entries
   private String[] aliases;
   private int aliasCount = 0;

   // AM entries
   private String[] morphAliases;
   private int morphAliasCount = 0;

   // st: morphological entries (either directly, or aliased from AM)
   private String[] stemExceptions = new String[8];
   private int stemExceptionCount = 0;
   // we set this during sorting, so we know to add an extra FST output.
   // when set, some words have exceptional stems, and the last entry is a pointer to stemExceptions
   boolean hasStemExceptions;

   // directory in which the constructor creates its temporary copy of the affix file
   private final Path tempPath = getDefaultTempDir(); // TODO: make this configurable?

   boolean ignoreCase;
   boolean complexPrefixes;
   boolean twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping

   int circumfix = -1; // circumfix flag, or -1 if one is not defined
   int keepcase = -1;  // keepcase flag, or -1 if one is not defined
   int needaffix = -1; // needaffix flag, or -1 if one is not defined
   int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined

   // ignored characters (dictionary, affix, inputs); kept sorted (see the IGNORE handling in readAffixFile)
   private char[] ignore;

   // FSTs used for ICONV/OCONV, output ord pointing to replacement text
   FST<CharsRef> iconv;
   FST<CharsRef> oconv;

   // true when ignoreCase is requested, or an IGNORE or ICONV declaration was read
   boolean needsInputCleaning;
   // true when an OCONV declaration was read
   boolean needsOutputCleaning;

   // true if we can strip suffixes "down to nothing"
   boolean fullStrip;

   // language declaration of the dictionary
   String language;
   // true if case algorithms should use alternate (Turkish/Azeri) mapping
   boolean alternateCasing;

   /**
    * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
    * and dictionary files.
    * You have to close the provided InputStreams yourself.
    * <p>
    * This is a convenience form of the multi-dictionary constructor: the single dictionary stream is wrapped
    * in a singleton list and {@code ignoreCase} is passed as {@code false}.
    *
    * @param tempDir Directory to use for offline sorting
    * @param tempFileNamePrefix prefix to use to generate temp file names
    * @param affix InputStream for reading the hunspell affix file (won't be closed).
    * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
    * @throws IOException Can be thrown while reading from the InputStreams
    * @throws ParseException Can be thrown if the content of the files does not meet expected formats
    */
   public Dictionary(Directory tempDir, String tempFileNamePrefix, InputStream affix, InputStream dictionary) throws IOException, ParseException {
     this(tempDir, tempFileNamePrefix, affix, Collections.singletonList(dictionary), false);
   }

   /**
    * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
    * and dictionary files.
    * You have to close the provided InputStreams yourself.
    * <p>
    * The affix stream is first copied to a temporary file because it has to be read twice: once to find the
    * declared encoding and once to parse the rules with that encoding. The temporary file is removed before
    * this constructor returns, whether or not it succeeds.
    *
    * @param tempDir Directory to use for offline sorting
    * @param tempFileNamePrefix prefix to use to generate temp file names
    * @param affix InputStream for reading the hunspell affix file (won't be closed).
    * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
    * @param ignoreCase stored as the dictionary's ignore-case setting; when {@code true}, input cleaning is
    *                   also enabled for affix and dictionary entries
    * @throws IOException Can be thrown while reading from the InputStreams
    * @throws ParseException Can be thrown if the content of the files does not meet expected formats
    */
   public Dictionary(Directory tempDir, String tempFileNamePrefix, InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException {
     this.ignoreCase = ignoreCase;
     this.needsInputCleaning = ignoreCase;
     this.needsOutputCleaning = false; // set if we have an OCONV
     flagLookup.add(new BytesRef()); // no flags -> ord 0

     Path aff = Files.createTempFile(tempPath, "affix", "aff");
     InputStream aff1 = null;
     InputStream aff2 = null;
     boolean success = false;
     try {
       // copy contents of affix stream to temp file.
       // the output stream is opened inside the guarded region so that the temp file
       // is still cleaned up if opening or writing fails.
       try (OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff))) {
         final byte[] buffer = new byte[1024 * 8];
         int len;
         while ((len = affix.read(buffer)) > 0) {
           out.write(buffer, 0, len);
         }
       }

       // pass 1: get encoding
       aff1 = new BufferedInputStream(Files.newInputStream(aff));
       String encoding = getDictionaryEncoding(aff1);

       // pass 2: parse affixes
       CharsetDecoder decoder = getJavaEncoding(encoding);
       aff2 = new BufferedInputStream(Files.newInputStream(aff));
       readAffixFile(aff2, decoder);

       // read dictionary entries
       IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
       Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
       readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, b);
       words = b.finish();
       aliases = null; // no longer needed
       morphAliases = null; // no longer needed
       success = true;
     } finally {
       IOUtils.closeWhileHandlingException(aff1, aff2);
       if (success) {
         // on the success path a failure to delete is reported to the caller
         Files.delete(aff);
       } else {
         // an exception is already propagating: don't mask it with a cleanup failure
         IOUtils.deleteFilesIgnoringExceptions(aff);
       }
     }
   }

   /**
    * Looks up Hunspell word forms from the dictionary
    *
    * @param word buffer holding the word
    * @param offset index of the first char of the word in {@code word}
    * @param length number of chars of the word
    * @return the output stored for the word in the {@code words} FST, or {@code null} if the word is
    *         not present (or the FST has not been built)
    */
   IntsRef lookupWord(char word[], int offset, int length) {
     return lookup(words, word, offset, length);
   }

   // only for testing
   // looks up the given char range in the prefixes FST; returns null when there is no exact match
   IntsRef lookupPrefix(char word[], int offset, int length) {
     return lookup(prefixes, word, offset, length);
   }

   // only for testing
   // looks up the given char range in the suffixes FST; returns null when there is no exact match.
   // note that suffix keys are stored reversed when the FST is built (see parseAffix).
   IntsRef lookupSuffix(char word[], int offset, int length) {
     return lookup(suffixes, word, offset, length);
   }

   /**
    * Walks {@code fst} along the code points of {@code word[offset .. offset+length)} and returns the
    * accumulated output of the exact match.
    *
    * @param fst automaton to search; may be {@code null}
    * @param word buffer holding the key
    * @param offset index of the first char of the key
    * @param length number of chars of the key
    * @return the summed arc outputs for the key, or {@code null} if {@code fst} is {@code null} or the
    *         key is not accepted
    */
   IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) {
     if (fst == null) {
       return null;
     }

     final FST.BytesReader in = fst.getBytesReader();
     final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
     // sentinel instance meaning "this arc contributes nothing"; compared by identity on purpose
     final IntsRef noOutput = fst.outputs.getNoOutput();
     IntsRef accumulated = noOutput;
     final int limit = offset + length;

     try {
       int pos = offset;
       while (pos < limit) {
         final int codePoint = Character.codePointAt(word, pos, limit);
         if (fst.findTargetArc(codePoint, arc, arc, in) == null) {
           return null; // no transition for this code point
         }
         if (arc.output != noOutput) {
           accumulated = fst.outputs.add(accumulated, arc.output);
         }
         pos += Character.charCount(codePoint);
       }

       // the key only counts as present if the state we ended in is final
       if (fst.findTargetArc(FST.END_LABEL, arc, arc, in) == null) {
         return null;
       }
       return arc.output != noOutput ? fst.outputs.add(accumulated, arc.output) : accumulated;
     } catch (IOException bogus) {
       throw new RuntimeException(bogus);
     }
   }

   /**
    * Reads the affix file through the provided InputStream, building up the prefix and suffix FSTs,
    * the strip table, and the option fields (flags, conversions, language, ...) of this dictionary.
    *
    * @param affixStream InputStream to read the content of the affix file from
    * @param decoder CharsetDecoder to decode the content of the file
    * @throws IOException Can be thrown while reading from the InputStream
    * @throws ParseException Can be thrown if a declaration in the affix file is malformed
    */
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
     TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
     TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
     Map<String,Integer> seenPatterns = new HashMap<>();

     // zero condition -> 0 ord
     seenPatterns.put(".*", 0);
     patterns.add(null);

     // zero strip -> 0 ord
     // (insertion-ordered, so iteration order below equals ordinal order)
     Map<String,Integer> seenStrips = new LinkedHashMap<>();
     seenStrips.put("", 0);

     LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
     String line = null;
     while ((line = reader.readLine()) != null) {
       // ignore any BOM marker on first line
       if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
         line = line.substring(1);
       }
       if (line.startsWith(ALIAS_KEY)) {
         parseAlias(line);
       } else if (line.startsWith(MORPH_ALIAS_KEY)) {
         parseMorphAlias(line);
       } else if (line.startsWith(PREFIX_KEY)) {
         parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
       } else if (line.startsWith(SUFFIX_KEY)) {
         parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
       } else if (line.startsWith(FLAG_KEY)) {
         // Assume that the FLAG line comes before any prefix or suffixes
         // Store the strategy so it can be used when parsing the dic file
         flagParsingStrategy = getFlagParsingStrategy(line);
       } else if (line.equals(COMPLEXPREFIXES_KEY)) {
         complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
       } else if (line.startsWith(CIRCUMFIX_KEY)) {
         circumfix = parseSingleFlagDeclaration(line, CIRCUMFIX_KEY, reader);
       } else if (line.startsWith(KEEPCASE_KEY)) {
         keepcase = parseSingleFlagDeclaration(line, KEEPCASE_KEY, reader);
       } else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
         // PSEUDOROOT is treated as a synonym; errors are reported under the NEEDAFFIX name
         needaffix = parseSingleFlagDeclaration(line, NEEDAFFIX_KEY, reader);
       } else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
         onlyincompound = parseSingleFlagDeclaration(line, ONLYINCOMPOUND_KEY, reader);
       } else if (line.startsWith(IGNORE_KEY)) {
         String parts[] = line.split("\\s+");
         if (parts.length != 2) {
           throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
         }
         ignore = parts[1].toCharArray();
         Arrays.sort(ignore);
         needsInputCleaning = true;
       } else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
         String parts[] = line.split("\\s+");
         String type = parts[0];
         if (parts.length != 2) {
           throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
         }
         // the header carries the number of mapping lines that follow
         int num = Integer.parseInt(parts[1]);
         FST<CharsRef> res = parseConversions(reader, num);
         if (ICONV_KEY.equals(type)) {
           iconv = res;
           needsInputCleaning |= iconv != null;
         } else {
           oconv = res;
           needsOutputCleaning |= oconv != null;
         }
       } else if (line.startsWith(FULLSTRIP_KEY)) {
         fullStrip = true;
       } else if (line.startsWith(LANG_KEY)) {
         language = line.substring(LANG_KEY.length()).trim();
         alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
       }
     }

     this.prefixes = affixFST(prefixes);
     this.suffixes = affixFST(suffixes);

     // flatten the unique strips into one char[] plus an offsets table;
     // strip i is stripData[stripOffsets[i] .. stripOffsets[i+1])
     int totalChars = 0;
     for (String strip : seenStrips.keySet()) {
       totalChars += strip.length();
     }
     stripData = new char[totalChars];
     stripOffsets = new int[seenStrips.size()+1];
     int currentOffset = 0;
     int currentIndex = 0;
     for (String strip : seenStrips.keySet()) {
       stripOffsets[currentIndex++] = currentOffset;
       strip.getChars(0, strip.length(), stripData, currentOffset);
       currentOffset += strip.length();
     }
     assert currentIndex == seenStrips.size();
     stripOffsets[currentIndex] = currentOffset;
   }

   /**
    * Parses a declaration of the form {@code OPTION flag} and returns the flag, decoded with the
    * current flag parsing strategy.
    *
    * @param line the complete declaration line
    * @param optionName option name to mention in the error message
    * @param reader reader the line came from, used for the error line number
    * @return the parsed flag
    * @throws ParseException if the line does not consist of exactly two whitespace-separated parts
    */
   private int parseSingleFlagDeclaration(String line, String optionName, LineNumberReader reader) throws ParseException {
     String parts[] = line.split("\\s+");
     if (parts.length != 2) {
       throw new ParseException("Illegal " + optionName + " declaration", reader.getLineNumber());
     }
     return flagParsingStrategy.parseFlag(parts[1]);
   }

   /**
    * Builds an FST from the sorted affix map: each key (converted to UTF-32) maps to the list of
    * affix ordinals stored under it.
    *
    * @param affixes affix text to affix ordinals, in sorted key order
    * @return the finished FST, as returned by the builder
    * @throws IOException Can be thrown while building the FST
    */
   private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
     final Builder<IntsRef> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
     final IntsRefBuilder key = new IntsRefBuilder();

     for (Map.Entry<String,List<Integer>> affix : affixes.entrySet()) {
       // copy the boxed ordinals into a primitive int sequence
       final List<Integer> ordinals = affix.getValue();
       final IntsRef output = new IntsRef(ordinals.size());
       int next = 0;
       for (int ordinal : ordinals) {
         output.ints[next++] = ordinal;
       }
       output.length = next;

       Util.toUTF32(affix.getKey(), key);
       fstBuilder.add(key.get(), output);
     }

     return fstBuilder.finish();
   }

   /**
    * Escapes every unescaped dash in an affix condition with a backslash.
    * <p>
    * A backslash together with the character following it is copied through unchanged, so a dash
    * that the dictionary already escaped (e.g. pt_PT) is not escaped a second time.
    *
    * @param re affix condition
    * @return the condition with each bare {@code -} replaced by {@code \-}
    */
   static String escapeDash(String re) {
     final int n = re.length();
     final StringBuilder out = new StringBuilder();
     int pos = 0;
     while (pos < n) {
       final char current = re.charAt(pos++);
       if (current == '-') {
         out.append("\\-");
         continue;
       }
       out.append(current);
       // existing escape sequence: copy the escaped character verbatim as well
       if (current == '\\' && pos < n) {
         out.append(re.charAt(pos++));
       }
     }
     return out.toString();
   }

   /**
    * Parses a specific affix rule putting the result into the provided affix map
    * <p>
    * The header announces how many rule lines follow; each of those lines is read from
    * {@code reader}, encoded as 8 bytes into {@code affixData} and registered in {@code affixes}
    * under its affix text (reversed for suffixes).
    *
    * @param affixes Map where the result of the parsing will be put
    * @param header Header line of the affix rule
    * @param reader LineNumberReader to read the content of the rule from
    * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
    *                         pattern
    * @param seenPatterns map from condition -&gt; index of patterns, for deduplication.
    * @param seenStrips map from strip string -&gt; strip ordinal, for deduplication.
    * @throws IOException Can be thrown while reading the rule
    * @throws ParseException Can be thrown if the header or a rule line is malformed, or the file ends
    *                        before all announced rule lines were read
    */
   private void parseAffix(TreeMap<String,List<Integer>> affixes,
                           String header,
                           LineNumberReader reader,
                           String conditionPattern,
                           Map<String,Integer> seenPatterns,
                           Map<String,Integer> seenStrips) throws IOException, ParseException {

     BytesRefBuilder scratch = new BytesRefBuilder();
     StringBuilder sb = new StringBuilder();
     String args[] = header.split("\\s+");

     // header layout: PFX|SFX flag cross_product number_of_rules
     if (args.length < 4) {
       throw new ParseException("The affix file contains an affix header with less than four elements: " + header, reader.getLineNumber());
     }

     boolean crossProduct = args[2].equals("Y");
     boolean isSuffix = SUFFIX_CONDITION_REGEX_PATTERN.equals(conditionPattern);

     int numLines = Integer.parseInt(args[3]);
     // each affix occupies 8 bytes, hence the << 3
     affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
     ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

     for (int i = 0; i < numLines; i++) {
       assert affixWriter.getPosition() == currentAffix << 3;
       String line = reader.readLine();
       if (line == null) {
         throw new ParseException("Unexpected end of affix file: expected " + numLines + " rules but found " + i, reader.getLineNumber());
       }
       String ruleArgs[] = line.split("\\s+");

       // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
       // condition is optional
       if (ruleArgs.length < 4) {
         throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber());
       }

       char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
       String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
       String affixArg = ruleArgs[3];
       char appendFlags[] = null;

       // first: parse continuation classes out of affix
       int flagSep = affixArg.lastIndexOf('/');
       if (flagSep != -1) {
         String flagPart = affixArg.substring(flagSep + 1);
         affixArg = affixArg.substring(0, flagSep);

         if (aliasCount > 0) {
           flagPart = getAliasValue(Integer.parseInt(flagPart));
         }

         appendFlags = flagParsingStrategy.parseFlags(flagPart);
         Arrays.sort(appendFlags);
         twoStageAffix = true;
       }
       // zero affix -> empty string
       if ("0".equals(affixArg)) {
         affixArg = "";
       }

       String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
       // at least the gascon affix file has this issue
       if (condition.startsWith("[") && condition.indexOf(']') == -1) {
         condition = condition + "]";
       }
       // "dash hasn't got special meaning" (we must escape it)
       if (condition.indexOf('-') >= 0) {
         condition = escapeDash(condition);
       }

       final String regex;
       if (".".equals(condition)) {
         regex = ".*"; // Zero condition is indicated by dot
       } else if (condition.equals(strip)) {
         regex = ".*"; // TODO: optimize this better:
                       // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                       // but this is complicated...
       } else {
         regex = String.format(Locale.ROOT, conditionPattern, condition);
       }

       // deduplicate patterns
       Integer patternIndex = seenPatterns.get(regex);
       if (patternIndex == null) {
         patternIndex = patterns.size();
         if (patternIndex > Short.MAX_VALUE) {
           throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");
         }
         seenPatterns.put(regex, patternIndex);
         CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
         patterns.add(pattern);
       }

       // deduplicate strips
       Integer stripOrd = seenStrips.get(strip);
       if (stripOrd == null) {
         stripOrd = seenStrips.size();
         seenStrips.put(strip, stripOrd);
         if (stripOrd > Character.MAX_VALUE) {
           throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
         }
       }

       if (appendFlags == null) {
         appendFlags = NOFLAGS;
       }

       encodeFlags(scratch, appendFlags);
       int appendFlagsOrd = flagLookup.add(scratch.get());
       if (appendFlagsOrd < 0) {
         // already exists in our hash
         appendFlagsOrd = (-appendFlagsOrd)-1;
       } else if (appendFlagsOrd > Short.MAX_VALUE) {
         // this limit is probably flexible, but it's a good sanity check too
         throw new UnsupportedOperationException("Too many unique append flags, please report this to dev@lucene.apache.org");
       }

       // 8-byte record: flag, strip ordinal, pattern ordinal + crossProduct bit, append-flags ordinal
       affixWriter.writeShort((short)flag);
       affixWriter.writeShort((short)stripOrd.intValue());
       // encode crossProduct into patternIndex
       int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
       affixWriter.writeShort((short)patternOrd);
       affixWriter.writeShort((short)appendFlagsOrd);

       if (needsInputCleaning) {
         CharSequence cleaned = cleanInput(affixArg, sb);
         affixArg = cleaned.toString();
       }

       // suffixes are keyed by their reversed text
       if (isSuffix) {
         affixArg = new StringBuilder(affixArg).reverse().toString();
       }

       List<Integer> list = affixes.get(affixArg);
       if (list == null) {
         list = new ArrayList<>();
         affixes.put(affixArg, list);
       }
       list.add(currentAffix);
       currentAffix++;
     }
   }

   /**
    * Reads {@code num} conversion lines (the body of an ICONV or OCONV table) and builds an FST that
    * maps each source sequence to its replacement text.
    * <p>
    * Each line must consist of exactly three whitespace-separated parts: keyword, source, replacement.
    *
    * @param reader reader positioned directly after the ICONV/OCONV header line
    * @param num number of mapping lines announced by the header
    * @return the finished FST, as returned by the builder
    * @throws IOException Can be thrown while reading the mappings
    * @throws ParseException Thrown if a mapping line is malformed, or the file ends before
    *                        {@code num} lines were read
    * @throws IllegalStateException if the same source sequence is mapped more than once
    */
   private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
     // sorted map: the FST builder is fed keys in sorted order
     Map<String,String> mappings = new TreeMap<>();

     for (int i = 0; i < num; i++) {
       String line = reader.readLine();
       if (line == null) {
         // readLine() signals end of stream with null; fail with a parse error instead of an NPE
         throw new ParseException("Unexpected end of affix file: expected " + num + " conversion entries but found " + i, reader.getLineNumber());
       }
       String parts[] = line.split("\\s+");
       if (parts.length != 3) {
         throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
       }
       if (mappings.put(parts[1], parts[2]) != null) {
         throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
       }
     }

     Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
     Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
     IntsRefBuilder scratchInts = new IntsRefBuilder();
     for (Map.Entry<String,String> entry : mappings.entrySet()) {
       Util.toUTF16(entry.getKey(), scratchInts);
       builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
     }

     return builder.finish();
   }

   /**
    * Matches an optional UTF-8 byte order mark followed by {@code SET} and whitespace.
    * The BOM is spelled as three separate chars because the affix file is scanned byte by byte,
    * each byte widened to one char.
    */
   static final Pattern ENCODING_PATTERN = Pattern.compile("^(\u00EF\u00BB\u00BF)?SET\\s+");

   /**
    * Scans the affix file for its {@code SET <encoding>} declaration.
    * <p>
    * The stream is read one byte at a time and split into lines at {@code \n} ({@code \r} is dropped).
    * Empty lines, whitespace-only lines and lines starting with {@code #} are skipped; other lines that
    * do not match the declaration are passed over as well.
    *
    * @param affix InputStream for reading the affix file
    * @return Encoding specified in the affix file, trimmed
    * @throws IOException Can be thrown while reading from the InputStream
    * @throws ParseException Thrown if the end of the file is reached without finding a {@code SET <encoding>} line
    */
   static String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
     final StringBuilder lineBuffer = new StringBuilder();
     while (true) {
       // collect the next line, byte by byte
       lineBuffer.setLength(0);
       int b = affix.read();
       while (b >= 0 && b != '\n') {
         if (b != '\r') {
           lineBuffer.append((char) b);
         }
         b = affix.read();
       }

       final String current = lineBuffer.toString();
       final boolean skippable = current.isEmpty() || current.charAt(0) == '#' || current.trim().isEmpty();
       if (skippable) {
         // b < 0 means the stream is exhausted: nothing left to look at
         if (b < 0) {
           throw new ParseException("Unexpected end of affix file.", 0);
         }
       } else {
         final Matcher declaration = ENCODING_PATTERN.matcher(current);
         if (declaration.find()) {
           return current.substring(declaration.end()).trim();
         }
       }
     }
   }

   /**
    * Encoding names found in affix files, mapped to the charset name that is passed to
    * {@link Charset#forName(String)} instead. Unmodifiable.
    */
   static final Map<String,String> CHARSET_ALIASES;
   static {
     final Map<String,String> aliases = new HashMap<>();
     aliases.put("microsoft-cp1251", "windows-1251");
     aliases.put("TIS620-2533", "TIS-620");
     CHARSET_ALIASES = Collections.unmodifiableMap(aliases);
   }

   /**
    * Resolves the encoding name declared in the affix file to a CharsetDecoder.
    * <p>
    * {@code ISO8859-14} gets a dedicated decoder; any other name is first translated through
    * {@link #CHARSET_ALIASES} and then looked up with {@link Charset#forName(String)}. The returned
    * decoder replaces malformed input instead of reporting it. This mapping is not exhaustive: hunspell
    * may allow encoding names (e.g. ISCII-DEVANAGARI) that are not handled here.
    *
    * @param encoding Encoding to retrieve the CharsetDecoder for
    * @return CharSetDecoder for the given encoding
    */
   private CharsetDecoder getJavaEncoding(String encoding) {
     if ("ISO8859-14".equals(encoding)) {
       return new ISO8859_14Decoder();
     }
     final String alias = CHARSET_ALIASES.get(encoding);
     final String charsetName = (alias != null) ? alias : encoding;
     return Charset.forName(charsetName)
         .newDecoder()
         .onMalformedInput(CodingErrorAction.REPLACE);
   }

   /**
    * Chooses the {@link FlagParsingStrategy} named on the FLAG line of the affix file.
    * <p>
    * The line must consist of exactly two whitespace-separated parts; the second one selects the
    * strategy: {@code num}, {@code UTF-8} or {@code long}.
    *
    * @param flagLine Line containing the flag information
    * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
    * @throws IllegalArgumentException if the line is malformed or names an unknown flag type
    */
   static FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
     final String[] tokens = flagLine.split("\\s+");
     if (tokens.length != 2) {
       throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine);
     }

     final String flagType = tokens[1];
     switch (flagType) {
       case NUM_FLAG_TYPE:
         return new NumFlagParsingStrategy();
       case UTF8_FLAG_TYPE:
         return new SimpleFlagParsingStrategy();
       case LONG_FLAG_TYPE:
         return new DoubleASCIIFlagParsingStrategy();
       default:
         throw new IllegalArgumentException("Unknown flag type: " + flagType);
     }
   }

   // control characters that unescapeEntry inserts into a dictionary line in place of the
   // textual separators; occurrences already present in the raw line are dropped there.
   final char FLAG_SEPARATOR = 0x1f; // flag separator after escaping
   final char MORPH_SEPARATOR = 0x1e; // separator for boundary of entry (may be followed by morph data)

   /**
    * Rewrites one raw dictionary line into the internal separator-based form.
    * <p>
    * In the part before the morphological boundary, a backslash escape is resolved to the escaped
    * character and an unescaped {@code /} becomes {@link #FLAG_SEPARATOR}. A {@link #MORPH_SEPARATOR}
    * is then appended, followed by the remainder of the line. Separator characters that already occur
    * in the raw line are dropped everywhere.
    *
    * @param entry raw line from the dictionary file
    * @return the rewritten line
    */
   String unescapeEntry(String entry) {
     final int total = entry.length();
     final int boundary = morphBoundary(entry);
     final StringBuilder out = new StringBuilder();

     // word and flags
     int i = 0;
     while (i < boundary) {
       final char ch = entry.charAt(i);
       if (ch == '\\' && i + 1 < total) {
         // escaped character: keep it literally and skip over both chars
         out.append(entry.charAt(i + 1));
         i += 2;
         continue;
       }
       if (ch == '/') {
         out.append(FLAG_SEPARATOR);
       } else if (ch != MORPH_SEPARATOR && ch != FLAG_SEPARATOR) {
         // our separator characters can occur in the wild (binary data was found embedded
         // in Zulu dictionaries), so they are filtered out rather than copied
         out.append(ch);
       }
       i++;
     }

     out.append(MORPH_SEPARATOR);

     // morphological data, copied as-is except for stray separator characters
     for (int j = boundary; j < total; j++) {
       final char c = entry.charAt(j);
       if (c != FLAG_SEPARATOR && c != MORPH_SEPARATOR) {
         out.append(c);
       }
     }
     return out.toString();
   }

   /**
    * Finds where the word-and-flags part of a dictionary line ends.
    * <p>
    * The boundary is the first space or tab that is either a tab itself or is followed by two letters
    * and a colon (the shape of a morphological field such as {@code st:}).
    *
    * @param line dictionary line
    * @return index of the boundary character, or {@code line.length()} if there is none
    */
   static int morphBoundary(String line) {
     final int len = line.length();
     for (int pos = indexOfSpaceOrTab(line, 0); pos >= 0 && pos < len; pos = indexOfSpaceOrTab(line, pos + 1)) {
       if (line.charAt(pos) == '\t') {
         return pos;
       }
       final boolean morphFieldFollows = pos + 3 < len
           && Character.isLetter(line.charAt(pos + 1))
           && Character.isLetter(line.charAt(pos + 2))
           && line.charAt(pos + 3) == ':';
       if (morphFieldFollows) {
         return pos;
       }
     }
     // no separator qualified as a boundary
     return len;
   }

   /**
    * Returns the index of the first space or tab in {@code text} at or after {@code start}.
    *
    * @param text string to search
    * @param start index to start searching from
    * @return index of the first space or tab, or {@code -1} if neither occurs
    */
   static int indexOfSpaceOrTab(String text, int start) {
     final int tab = text.indexOf('\t', start);
     final int space = text.indexOf(' ', start);
     if (tab < 0) {
       return space; // may itself be -1
     }
     if (space < 0) {
       return tab;
     }
     return tab < space ? tab : space;
   }

   /**
    * Reads the dictionary file through the provided InputStreams, building up the words map.
    * <p>
    * The work happens in three phases: every usable line of every stream is unescaped (and its word part
    * cleaned when {@code needsInputCleaning} is set) and written to a temporary file in {@code tempDir};
    * that file is sorted with an {@code OfflineSorter}; finally the sorted rows are read back and consecutive
    * rows with the same entry are merged into a single {@code words.add} call. The unsorted file is deleted
    * once sorting has finished (successfully or not) and the sorted file once it has been consumed.
    *
    * @param tempDir Directory in which the temporary unsorted and sorted files are created
    * @param tempFileNamePrefix Prefix used for the names of the temporary files
    * @param dictionaries InputStreams to read the dictionary file through
    * @param decoder CharsetDecoder used to decode the contents of the file
    * @param words Builder to which each distinct entry is added together with its flag ordinals
    *              (interleaved with stem exception ids when {@code hasStemExceptions} is set)
    * @throws IOException Can be thrown while reading from the file
    */
   private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
     BytesRefBuilder flagsScratch = new BytesRefBuilder();
     IntsRefBuilder scratchInts = new IntsRefBuilder();

     StringBuilder sb = new StringBuilder();

     // phase 1: dump all (normalized) entries into one unsorted temporary file
     IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
     try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
       for (InputStream dictionary : dictionaries) {
         BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
         String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

         while ((line = lines.readLine()) != null) {
           // wild and unpredictable code comment rules
           if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
             continue;
           }
           line = unescapeEntry(line);
           // if we haven't seen any stem exceptions, try to parse one
           if (hasStemExceptions == false) {
             int morphStart = line.indexOf(MORPH_SEPARATOR);
             if (morphStart >= 0 && morphStart < line.length()) {
               hasStemExceptions = parseStemException(line.substring(morphStart+1)) != null;
             }
           }
           if (needsInputCleaning) {
             // only the word itself is cleaned: everything from the first flag (or, failing that,
             // morph) separator onwards is written out unchanged
             int flagSep = line.indexOf(FLAG_SEPARATOR);
             if (flagSep == -1) {
               flagSep = line.indexOf(MORPH_SEPARATOR);
             }
             if (flagSep == -1) {
               CharSequence cleansed = cleanInput(line, sb);
               writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
             } else {
               String text = line.substring(0, flagSep);
               CharSequence cleansed = cleanInput(text, sb);
               // make sure sb holds the cleaned word before the untouched remainder is appended
               if (cleansed != sb) {
                 sb.setLength(0);
                 sb.append(cleansed);
               }
               sb.append(line.substring(flagSep));
               writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
             }
           } else {
             writer.write(line.getBytes(StandardCharsets.UTF_8));
           }
         }
       }
       CodecUtil.writeFooter(unsorted);
     }

     // phase 2: sort the rows by the part preceding their last separator byte
     OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new Comparator<BytesRef>() {
       // reusable views onto the rows being compared, so that no bytes are copied
       BytesRef scratch1 = new BytesRef();
       BytesRef scratch2 = new BytesRef();

       @Override
       public int compare(BytesRef o1, BytesRef o2) {
         scratch1.bytes = o1.bytes;
         scratch1.offset = o1.offset;
         scratch1.length = o1.length;

         // scan backwards and cut the view at the last flag/morph separator
         for (int i = scratch1.length - 1; i >= 0; i--) {
           if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
             scratch1.length = i;
             break;
           }
         }

         scratch2.bytes = o2.bytes;
         scratch2.offset = o2.offset;
         scratch2.length = o2.length;

         // same truncation for the second row
         for (int i = scratch2.length - 1; i >= 0; i--) {
           if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
             scratch2.length = i;
             break;
           }
         }

         int cmp = scratch1.compareTo(scratch2);
         if (cmp == 0) {
           // tie break on whole row
           return o1.compareTo(o2);
         } else {
           return cmp;
         }
       }
     });

     String sorted;
     boolean success = false;
     try {
       sorted = sorter.sort(unsorted.getName());
       success = true;
     } finally {
       // the unsorted file is no longer needed; on failure, delete quietly so that the
       // original exception is the one that propagates
       if (success) {
         tempDir.deleteFile(unsorted.getName());
       } else {
         IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
       }
     }

     boolean success2 = false;

     // phase 3: read the sorted rows back, merging rows that share the same entry
     try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {

       // TODO: the flags themselves can be double-chars (long) or also numeric
       // either way the trick is to encode them as char... but they must be parsed differently

       String currentEntry = null;
       IntsRefBuilder currentOrds = new IntsRefBuilder();

       while (true) {
         BytesRef scratch = reader.next();
         if (scratch == null) {
           break;
         }

         String line = scratch.utf8ToString();
         String entry;
         char wordForm[];
         int end;

         // split the row into entry, flags and (after 'end') the morphological data
         // NOTE(review): both branches use indexOf(MORPH_SEPARATOR) as a substring bound, so every row is
         // presumably guaranteed to contain it (added by unescapeEntry?) - confirm
         int flagSep = line.indexOf(FLAG_SEPARATOR);
         if (flagSep == -1) {
           wordForm = NOFLAGS;
           end = line.indexOf(MORPH_SEPARATOR);
           entry = line.substring(0, end);
         } else {
           end = line.indexOf(MORPH_SEPARATOR);
           String flagPart = line.substring(flagSep + 1, end);
           // when flag aliases are defined, the flag part is an alias number rather than the flags
           if (aliasCount > 0) {
             flagPart = getAliasValue(Integer.parseInt(flagPart));
           }

           wordForm = flagParsingStrategy.parseFlags(flagPart);
           Arrays.sort(wordForm);
           entry = line.substring(0, flagSep);
         }
         // we possibly have morphological data
         int stemExceptionID = 0;
         if (hasStemExceptions && end+1 < line.length()) {
           String stemException = parseStemException(line.substring(end+1));
           if (stemException != null) {
             stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount+1);
             stemExceptionID = stemExceptionCount+1; // we use '0' to indicate no exception for the form
             stemExceptions[stemExceptionCount++] = stemException;
           }
         }

         // the very first row is treated as "greater" so that it starts a new entry
         int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
         if (cmp < 0) {
           throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
         } else {
           // map the flag set to an ordinal, reusing the ordinal of an identical set seen before
           encodeFlags(flagsScratch, wordForm);
           int ord = flagLookup.add(flagsScratch.get());
           if (ord < 0) {
             // already exists in our hash
             ord = (-ord)-1;
           }
           // finalize current entry, and switch "current" if necessary
           if (cmp > 0 && currentEntry != null) {
             Util.toUTF32(currentEntry, scratchInts);
             words.add(scratchInts.get(), currentOrds.get());
           }
           // swap current
           if (cmp > 0 || currentEntry == null) {
             currentEntry = entry;
             currentOrds = new IntsRefBuilder(); // must be this way
           }
           // with stem exceptions every form occupies two ints (flag ordinal, exception id), otherwise one
           if (hasStemExceptions) {
             currentOrds.append(ord);
             currentOrds.append(stemExceptionID);
           } else {
             currentOrds.append(ord);
           }
         }
       }

       // finalize last entry
       // NOTE(review): currentEntry is still null here if no row was read at all (dictionary without
       // entries); presumably Util.toUTF32 fails on null - confirm whether that case needs a guard
       Util.toUTF32(currentEntry, scratchInts);
       words.add(scratchInts.get(), currentOrds.get());
       success2 = true;
     } finally {
       // always remove the sorted file; quietly when we are already failing
       if (success2) {
         tempDir.deleteFile(sorted);
       } else {
         IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
       }
     }
   }

   /**
    * Decodes a byte sequence into flags, combining each pair of bytes (high byte first) into one char.
    *
    * @param b bytes to decode, read from {@code b.offset} for {@code b.length} bytes
    * @return the decoded flags; {@code CharsRef.EMPTY_CHARS} when {@code b} is empty
    */
   static char[] decodeFlags(BytesRef b) {
     if (b.length == 0) {
       return CharsRef.EMPTY_CHARS;
     }
     final char[] decoded = new char[b.length >>> 1];
     final int limit = b.offset + b.length;
     int dst = 0;
     int src = b.offset;
     while (src < limit) {
       final int high = b.bytes[src] << 8;
       final int low = b.bytes[src + 1] & 0xff;
       decoded[dst++] = (char) (high | low);
       src += 2;
     }
     return decoded;
   }

   /**
    * Encodes flags into {@code b}, replacing its previous content. Every flag is written as two bytes,
    * high byte first.
    *
    * @param b builder that receives the encoded bytes
    * @param flags flags to encode
    */
   static void encodeFlags(BytesRefBuilder b, char flags[]) {
     b.grow(flags.length << 1);
     b.clear();
     for (char flag : flags) {
       b.append((byte) ((flag >> 8) & 0xff));
       b.append((byte) (flag & 0xff));
     }
   }

   /**
    * Parses one flag alias line. The first line seen carries the number of aliases and allocates the
    * alias table; each later line contributes the next alias value.
    *
    * @param line alias line, split on whitespace
    */
   private void parseAlias(String line) {
     final String[] tokens = line.split("\\s+");
     if (aliases != null) {
       // an alias can map to no flags
       final String value = tokens.length == 1 ? "" : tokens[1];
       aliases[aliasCount++] = value;
       return;
     }
     // first line should be the aliases count
     aliases = new String[Integer.parseInt(tokens[1])];
   }

   /**
    * Resolves a flag alias number to the aliased flags.
    *
    * @param id alias number, counted from 1
    * @return the value stored for that alias
    * @throws IllegalArgumentException if {@code id} lies outside the alias table
    */
   private String getAliasValue(int id) {
     final int slot = id - 1;
     try {
       return aliases[slot];
     } catch (IndexOutOfBoundsException outOfRange) {
       throw new IllegalArgumentException("Bad flag alias number:" + id, outOfRange);
     }
   }

   /**
    * Looks up a stem exception by its id.
    *
    * @param id stem exception id, counted from 1
    * @return the stem exception stored under that id
    */
   String getStemException(int id) {
     final int slot = id - 1;
     return stemExceptions[slot];
   }

   /**
    * Parses one morphological alias line. The first line seen carries the number of aliases and
    * allocates the table; each later line contributes the next alias.
    *
    * @param line morphological alias line
    */
   private void parseMorphAlias(String line) {
     if (morphAliases != null) {
       final String alias = line.substring(2); // leave the space
       morphAliases[morphAliasCount++] = alias;
       return;
     }
     // first line should be the aliases count
     morphAliases = new String[Integer.parseInt(line.substring(3))];
   }

   /**
    * Extracts the value of the {@code st:} field from morphological data.
    *
    * @param morphData morphological data of an entry, or the number of a morphological alias when such
    *                  aliases have been read
    * @return the text following {@code st:} up to the next space or tab (or the end of the data),
    *         or {@code null} if there is no {@code st:} field preceded by a space or tab
    */
   private String parseStemException(String morphData) {
     // first see if it's an alias
     if (morphAliasCount > 0) {
       try {
         final int aliasNumber = Integer.parseInt(morphData.trim());
         morphData = morphAliases[aliasNumber - 1];
       } catch (NumberFormatException e) {
         // fine
       }
     }
     // try to parse morph entry
     int fieldStart = morphData.indexOf(" st:");
     if (fieldStart < 0) {
       fieldStart = morphData.indexOf("\tst:");
     }
     if (fieldStart < 0) {
       return null;
     }
     // skip the leading separator when looking for the end of the field
     final int nextSeparator = indexOfSpaceOrTab(morphData, fieldStart + 1);
     final int fieldEnd = nextSeparator < 0 ? morphData.length() : nextSeparator;
     return morphData.substring(fieldStart + 4, fieldEnd);
   }

   /**
    * Abstraction of the process of parsing flags taken from the affix and dic files
    */
   static abstract class FlagParsingStrategy {

     /**
      * Parses the given String into a single flag
      *
      * @param rawFlag String to parse into a flag
      * @return Parsed flag
      * @throws IllegalArgumentException if the String does not parse into exactly one flag
      */
     char parseFlag(String rawFlag) {
       final char[] parsed = parseFlags(rawFlag);
       if (parsed.length == 1) {
         return parsed[0];
       }
       throw new IllegalArgumentException("expected only one flag, got: " + rawFlag);
     }

     /**
      * Parses the given String into multiple flags
      *
      * @param rawFlags String to parse into flags
      * @return Parsed flags
      */
     abstract char[] parseFlags(String rawFlags);
   }

   /**
    * Simple implementation of {@link FlagParsingStrategy} that treats each char of the String as an individual flag.
    * Can be used with both the ASCII and UTF-8 flag types.
    */
   private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
     @Override
     public char[] parseFlags(String rawFlags) {
       // one flag per char, in the order they appear
       final char[] flags = new char[rawFlags.length()];
       rawFlags.getChars(0, flags.length, flags, 0);
       return flags;
     }
   }

   /**
    * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form.  In the case
    * of multiple flags, each number is separated by a comma.
    */
   private static class NumFlagParsingStrategy extends FlagParsingStrategy {
     @Override
     public char[] parseFlags(String rawFlags) {
       final String[] parts = rawFlags.trim().split(",");
       final char[] parsed = new char[parts.length];
       int count = 0;

       for (String part : parts) {
         // note, removing the trailing X/leading I for nepali... what is the rule here?!
         final String digits = part.replaceAll("[^0-9]", "");
         // note, ignoring empty flags (this happens in danish, for example)
         if (digits.isEmpty() == false) {
           parsed[count++] = (char) Integer.parseInt(digits);
         }
       }

       // shrink the result when some parts were skipped
       return count < parsed.length ? ArrayUtil.copyOfSubArray(parsed, 0, count) : parsed;
     }
   }

   /**
    * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
    * must be combined into a single character.
    */
   private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {

     @Override
     public char[] parseFlags(String rawFlags) {
       final int rawLength = rawFlags.length();
       if (rawLength == 0) {
         return new char[0];
       }
       if ((rawLength & 1) != 0) {
         throw new IllegalArgumentException("Invalid flags (should be even number of characters): " + rawFlags);
       }

       final char[] flags = new char[rawLength >> 1];
       for (int pair = 0; pair < flags.length; pair++) {
         final char high = rawFlags.charAt(2 * pair);
         final char low = rawFlags.charAt(2 * pair + 1);
         if (high >= 256 || low >= 256) {
           throw new IllegalArgumentException("Invalid flags (LONG flags must be double ASCII): " + rawFlags);
         }
         // first char goes into the upper byte, second char into the lower byte
         flags[pair] = (char) (high << 8 | low);
       }
       return flags;
     }
   }

   /**
    * Tells whether {@code flag} occurs in {@code flags}.
    *
    * @param flags flags to search with a binary search, so they must be sorted in ascending order
    * @param flag flag to look for
    * @return {@code true} if the flag was found
    */
   static boolean hasFlag(char flags[], char flag) {
     final int position = Arrays.binarySearch(flags, flag);
     return position >= 0;
   }

   /**
    * Normalizes {@code input} into {@code reuse}: characters listed in {@code ignore} are dropped, the
    * {@code iconv} mappings (if any) are applied, and the result is case folded when {@code ignoreCase} is set.
    *
    * @param input text to clean
    * @param reuse builder that is cleared and then filled with the cleaned text
    * @return {@code reuse}
    */
   CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
     reuse.setLength(0);

     // if we have no input conversion mappings, case folding is done on-the-fly while copying
     final boolean foldWhileCopying = ignoreCase && iconv == null;

     for (int pos = 0; pos < input.length(); pos++) {
       final char raw = input.charAt(pos);
       final boolean ignored = ignore != null && Arrays.binarySearch(ignore, raw) >= 0;
       if (ignored == false) {
         reuse.append(foldWhileCopying ? caseFold(raw) : raw);
       }
     }

     if (iconv == null) {
       return reuse;
     }

     try {
       applyMappings(iconv, reuse);
     } catch (IOException bogus) {
       throw new RuntimeException(bogus);
     }
     // with mappings present, folding has to happen on the mapped text
     if (ignoreCase) {
       for (int pos = 0; pos < reuse.length(); pos++) {
         reuse.setCharAt(pos, caseFold(reuse.charAt(pos)));
       }
     }

     return reuse;
   }

   /**
    * Folds a single character (according to LANG if present). With {@code alternateCasing} enabled,
    * {@code 'I'} folds to dotless {@code 'ı'} and {@code 'İ'} folds to {@code 'i'}; every other
    * character is lower-cased with {@link Character#toLowerCase(char)}.
    */
   char caseFold(char c) {
     if (alternateCasing) {
       if (c == 'I') {
         return 'ı';
       }
       if (c == 'İ') {
         return 'i';
       }
     }
     return Character.toLowerCase(c);
   }

   // TODO: this could be more efficient!
   /**
    * Rewrites {@code sb} in place: at every position the longest key of {@code fst} matching there is
    * replaced by its output, and scanning resumes after the inserted replacement.
    *
    * @param fst mappings from input char sequences to their replacements
    * @param sb text to rewrite
    * @throws IOException if reading the FST fails
    */
   static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
     final FST.BytesReader bytesReader = fst.getBytesReader();
     final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
     final CharsRef noOutput = fst.outputs.getNoOutput();

     // temporary stuff
     final FST.Arc<CharsRef> arc = new FST.Arc<>();

     int start = 0;
     while (start < sb.length()) {
       arc.copyFrom(firstArc);
       CharsRef accumulated = noOutput;
       int matchEnd = -1;
       CharsRef matchOutput = null;

       // walk the FST for as long as the text keeps matching, remembering the last final state
       int cursor = start;
       while (cursor < sb.length() && fst.findTargetArc(sb.charAt(cursor), arc, arc, bytesReader) != null) {
         accumulated = fst.outputs.add(accumulated, arc.output);
         if (arc.isFinal()) {
           matchOutput = fst.outputs.add(accumulated, arc.nextFinalOutput);
           matchEnd = cursor;
         }
         cursor++;
       }

       if (matchEnd >= 0) {
         sb.delete(start, matchEnd + 1);
         sb.insert(start, matchOutput);
         // continue right after the replacement
         start += matchOutput.length;
       } else {
         start++;
       }
     }
   }

   /**
    * Returns true if this dictionary was constructed with the {@code ignoreCase} option
    *
    * @return the value of the {@code ignoreCase} field
    */
   public boolean getIgnoreCase() {
     return ignoreCase;
   }

   /**
    * Default temporary directory; {@code null} until it is set explicitly or resolved lazily by
    * {@link #getDefaultTempDir()}. All reads and writes happen while holding the class lock.
    */
   private static Path DEFAULT_TEMP_DIR;

   /**
    * Used by test framework
    *
    * @param tempDir directory to return from {@link #getDefaultTempDir()}; {@code null} makes the next
    *                call resolve {@code java.io.tmpdir} again
    */
   public static synchronized void setDefaultTempDir(Path tempDir) {
     // synchronized like the getter, so a value set from another thread is guaranteed to be visible to it
     DEFAULT_TEMP_DIR = tempDir;
   }

   /**
    * Returns the default temporary directory. By default, java.io.tmpdir. If not accessible
    * or not available, an IOException is thrown
    *
    * @return the directory set through {@link #setDefaultTempDir(Path)}, or else the directory named by
    *         the {@code java.io.tmpdir} system property
    * @throws IOException if the property is missing or the directory it names is not writable
    */
   static synchronized Path getDefaultTempDir() throws IOException {
     if (DEFAULT_TEMP_DIR == null) {
       // Lazy init
       String tempDirPath = System.getProperty("java.io.tmpdir");
       if (tempDirPath == null)  {
         throw new IOException("Java has no temporary folder property (java.io.tmpdir)?");
       }
       Path tempDirectory = Paths.get(tempDirPath);
       if (Files.isWritable(tempDirectory) == false) {
         throw new IOException("Java's temporary folder not present or writeable?: "
                               + tempDirectory.toAbsolutePath());
       }
       // only remember the directory once it has passed the checks
       DEFAULT_TEMP_DIR = tempDirectory;
     }

     return DEFAULT_TEMP_DIR;
   }
 }