| package org.apache.lucene.analysis.hunspell; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.Version;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
| |
| /** |
| * In-memory structure for the dictionary (.dic) and affix (.aff) |
| * data of a hunspell dictionary. |
| */ |
public class HunspellDictionary {

  // Shared sentinel for dictionary entries that carry no affix flags.
  static final HunspellWord NOFLAGS = new HunspellWord();

  // Directive keywords recognized while scanning the affix (.aff) file.
  private static final String ALIAS_KEY = "AF";
  private static final String PREFIX_KEY = "PFX";
  private static final String SUFFIX_KEY = "SFX";
  private static final String FLAG_KEY = "FLAG";

  // Values of the FLAG directive that select a flag-parsing strategy.
  private static final String NUM_FLAG_TYPE = "num";
  private static final String UTF8_FLAG_TYPE = "UTF-8";
  private static final String LONG_FLAG_TYPE = "long";

  // String.format patterns used to turn an affix condition into a regex
  // (prefix conditions anchor at the start, suffix conditions at the end).
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

  private static final boolean IGNORE_CASE_DEFAULT = false;
  private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true;

  // word -> word forms (flag sets); affix append text -> matching affix rules
  private CharArrayMap<List<HunspellWord>> words;
  private CharArrayMap<List<HunspellAffix>> prefixes;
  private CharArrayMap<List<HunspellAffix>> suffixes;

  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
  private boolean ignoreCase = IGNORE_CASE_DEFAULT;

  private final Version version;

  // AF alias table: flag strings indexed by alias number - 1; see parseAlias/getAliasValue
  private String[] aliases;
  private int aliasCount = 0;
| |
| /** |
| * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix |
| * and dictionary files. |
| * You have to close the provided InputStreams yourself. |
| * |
| * @param affix InputStream for reading the hunspell affix file (won't be closed). |
| * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed). |
| * @param version Lucene Version |
| * @throws IOException Can be thrown while reading from the InputStreams |
| * @throws ParseException Can be thrown if the content of the files does not meet expected formats |
| */ |
| public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException { |
| this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT); |
| } |
| |
| /** |
| * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix |
| * and dictionary files. |
| * You have to close the provided InputStreams yourself. |
| * |
| * @param affix InputStream for reading the hunspell affix file (won't be closed). |
| * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed). |
| * @param version Lucene Version |
| * @param ignoreCase If true, dictionary matching will be case insensitive |
| * @throws IOException Can be thrown while reading from the InputStreams |
| * @throws ParseException Can be thrown if the content of the files does not meet expected formats |
| */ |
| public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException { |
| this(affix, Arrays.asList(dictionary), version, ignoreCase); |
| } |
| |
| /** |
| * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix |
| * and dictionary files. |
| * You have to close the provided InputStreams yourself. |
| * |
| * @param affix InputStream for reading the hunspell affix file (won't be closed). |
| * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed). |
| * @param version Lucene Version |
| * @param ignoreCase If true, dictionary matching will be case insensitive |
| * @throws IOException Can be thrown while reading from the InputStreams |
| * @throws ParseException Can be thrown if the content of the files does not meet expected formats |
| */ |
| public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException { |
| this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT); |
| } |
| |
| /** |
| * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix |
| * and dictionary files. |
| * You have to close the provided InputStreams yourself. |
| * |
| * @param affix InputStream for reading the hunspell affix file (won't be closed). |
| * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed). |
| * @param version Lucene Version |
| * @param ignoreCase If true, dictionary matching will be case insensitive |
| * @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored) |
| * @throws IOException Can be thrown while reading from the InputStreams |
| * @throws ParseException Can be thrown if the content of the files does not meet expected formats |
| */ |
| public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException { |
| this.version = version; |
| this.ignoreCase = ignoreCase; |
| String encoding = getDictionaryEncoding(affix); |
| CharsetDecoder decoder = getJavaEncoding(encoding); |
| readAffixFile(affix, decoder, strictAffixParsing); |
| words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase); |
| for (InputStream dictionary : dictionaries) { |
| readDictionaryFile(dictionary, decoder); |
| } |
| } |
| |
| /** |
| * Looks up HunspellWords that match the String created from the given char array, offset and length |
| * |
| * @param word Char array to generate the String from |
| * @param offset Offset in the char array that the String starts at |
| * @param length Length from the offset that the String is |
| * @return List of HunspellWords that match the generated String, or {@code null} if none are found |
| */ |
| public List<HunspellWord> lookupWord(char word[], int offset, int length) { |
| return words.get(word, offset, length); |
| } |
| |
| /** |
| * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length |
| * |
| * @param word Char array to generate the String from |
| * @param offset Offset in the char array that the String starts at |
| * @param length Length from the offset that the String is |
| * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found |
| */ |
| public List<HunspellAffix> lookupPrefix(char word[], int offset, int length) { |
| return prefixes.get(word, offset, length); |
| } |
| |
| /** |
| * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length |
| * |
| * @param word Char array to generate the String from |
| * @param offset Offset in the char array that the String starts at |
| * @param length Length from the offset that the String is |
| * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found |
| */ |
| public List<HunspellAffix> lookupSuffix(char word[], int offset, int length) { |
| return suffixes.get(word, offset, length); |
| } |
| |
| /** |
| * Reads the affix file through the provided InputStream, building up the prefix and suffix maps |
| * |
| * @param affixStream InputStream to read the content of the affix file from |
| * @param decoder CharsetDecoder to decode the content of the file |
| * @throws IOException Can be thrown while reading from the InputStream |
| */ |
| private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException { |
| prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase); |
| suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase); |
| |
| LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); |
| String line = null; |
| while ((line = reader.readLine()) != null) { |
| if (line.startsWith(ALIAS_KEY)) { |
| parseAlias(line); |
| } else if (line.startsWith(PREFIX_KEY)) { |
| parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict); |
| } else if (line.startsWith(SUFFIX_KEY)) { |
| parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict); |
| } else if (line.startsWith(FLAG_KEY)) { |
| // Assume that the FLAG line comes before any prefix or suffixes |
| // Store the strategy so it can be used when parsing the dic file |
| flagParsingStrategy = getFlagParsingStrategy(line); |
| } |
| } |
| } |
| |
| /** |
| * Parses a specific affix rule putting the result into the provided affix map |
| * |
| * @param affixes Map where the result of the parsing will be put |
| * @param header Header line of the affix rule |
| * @param reader BufferedReader to read the content of the rule from |
| * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex |
| * pattern |
| * @throws IOException Can be thrown while reading the rule |
| */ |
| private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes, |
| String header, |
| LineNumberReader reader, |
| String conditionPattern, |
| boolean strict) throws IOException, ParseException { |
| String args[] = header.split("\\s+"); |
| |
| boolean crossProduct = args[2].equals("Y"); |
| |
| int numLines = Integer.parseInt(args[3]); |
| for (int i = 0; i < numLines; i++) { |
| String line = reader.readLine(); |
| String ruleArgs[] = line.split("\\s+"); |
| |
| if (ruleArgs.length < 5) { |
| if (strict) { |
| throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber()); |
| } |
| continue; |
| } |
| |
| HunspellAffix affix = new HunspellAffix(); |
| |
| affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1])); |
| affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]); |
| |
| String affixArg = ruleArgs[3]; |
| |
| int flagSep = affixArg.lastIndexOf('/'); |
| if (flagSep != -1) { |
| String flagPart = affixArg.substring(flagSep + 1); |
| |
| if (aliasCount > 0) { |
| flagPart = getAliasValue(Integer.parseInt(flagPart)); |
| } |
| |
| char appendFlags[] = flagParsingStrategy.parseFlags(flagPart); |
| Arrays.sort(appendFlags); |
| affix.setAppendFlags(appendFlags); |
| affix.setAppend(affixArg.substring(0, flagSep)); |
| } else { |
| affix.setAppend(affixArg); |
| } |
| |
| String condition = ruleArgs[4]; |
| affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition)); |
| affix.setCrossProduct(crossProduct); |
| |
| List<HunspellAffix> list = affixes.get(affix.getAppend()); |
| if (list == null) { |
| list = new ArrayList<HunspellAffix>(); |
| affixes.put(affix.getAppend(), list); |
| } |
| |
| list.add(affix); |
| } |
| } |
| |
| /** |
| * Parses the encoding specified in the affix file readable through the provided InputStream |
| * |
| * @param affix InputStream for reading the affix file |
| * @return Encoding specified in the affix file |
| * @throws IOException Can be thrown while reading from the InputStream |
| * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>} |
| */ |
| private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException { |
| final StringBuilder encoding = new StringBuilder(); |
| for (;;) { |
| encoding.setLength(0); |
| int ch; |
| while ((ch = affix.read()) >= 0) { |
| if (ch == '\n') { |
| break; |
| } |
| if (ch != '\r') { |
| encoding.append((char)ch); |
| } |
| } |
| if ( |
| encoding.length() == 0 || encoding.charAt(0) == '#' || |
| // this test only at the end as ineffective but would allow lines only containing spaces: |
| encoding.toString().trim().length() == 0 |
| ) { |
| if (ch < 0) { |
| throw new ParseException("Unexpected end of affix file.", 0); |
| } |
| continue; |
| } |
| if ("SET ".equals(encoding.substring(0, 4))) { |
| // cleanup the encoding string, too (whitespace) |
| return encoding.substring(4).trim(); |
| } |
| throw new ParseException("The first non-comment line in the affix file must "+ |
| "be a 'SET charset', was: '" + encoding +"'", 0); |
| } |
| } |
| |
| /** |
| * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and |
| * MICROSOFT-CP1251 etc are allowed... |
| * |
| * @param encoding Encoding to retrieve the CharsetDecoder for |
| * @return CharSetDecoder for the given encoding |
| */ |
| private CharsetDecoder getJavaEncoding(String encoding) { |
| Charset charset = Charset.forName(encoding); |
| return charset.newDecoder(); |
| } |
| |
| /** |
| * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file |
| * |
| * @param flagLine Line containing the flag information |
| * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition |
| */ |
| private FlagParsingStrategy getFlagParsingStrategy(String flagLine) { |
| String flagType = flagLine.substring(5); |
| |
| if (NUM_FLAG_TYPE.equals(flagType)) { |
| return new NumFlagParsingStrategy(); |
| } else if (UTF8_FLAG_TYPE.equals(flagType)) { |
| return new SimpleFlagParsingStrategy(); |
| } else if (LONG_FLAG_TYPE.equals(flagType)) { |
| return new DoubleASCIIFlagParsingStrategy(); |
| } |
| |
| throw new IllegalArgumentException("Unknown flag type: " + flagType); |
| } |
| |
| /** |
| * Reads the dictionary file through the provided InputStream, building up the words map |
| * |
| * @param dictionary InputStream to read the dictionary file through |
| * @param decoder CharsetDecoder used to decode the contents of the file |
| * @throws IOException Can be thrown while reading from the file |
| */ |
| private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException { |
| BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder)); |
| // TODO: don't create millions of strings. |
| String line = reader.readLine(); // first line is number of entries |
| int numEntries = Integer.parseInt(line); |
| |
| // TODO: the flags themselves can be double-chars (long) or also numeric |
| // either way the trick is to encode them as char... but they must be parsed differently |
| while ((line = reader.readLine()) != null) { |
| String entry; |
| HunspellWord wordForm; |
| |
| int flagSep = line.lastIndexOf('/'); |
| if (flagSep == -1) { |
| wordForm = NOFLAGS; |
| entry = line; |
| } else { |
| // note, there can be comments (morph description) after a flag. |
| // we should really look for any whitespace |
| int end = line.indexOf('\t', flagSep); |
| if (end == -1) |
| end = line.length(); |
| |
| String flagPart = line.substring(flagSep + 1, end); |
| if (aliasCount > 0) { |
| flagPart = getAliasValue(Integer.parseInt(flagPart)); |
| } |
| |
| wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart)); |
| Arrays.sort(wordForm.getFlags()); |
| entry = line.substring(0, flagSep); |
| } |
| if(ignoreCase) { |
| entry = entry.toLowerCase(Locale.ROOT); |
| } |
| |
| List<HunspellWord> entries = new ArrayList<HunspellWord>(); |
| entries.add(wordForm); |
| words.put(entry, entries); |
| } |
| } |
| |
  /** Returns the Lucene {@link Version} this dictionary was constructed with. */
  public Version getVersion() {
    return version;
  }
| |
| private void parseAlias(String line) { |
| String ruleArgs[] = line.split("\\s+"); |
| if (aliases == null) { |
| //first line should be the aliases count |
| final int count = Integer.parseInt(ruleArgs[1]); |
| aliases = new String[count]; |
| } else { |
| aliases[aliasCount++] = ruleArgs[1]; |
| } |
| } |
| |
  /**
   * Resolves a numeric {@code AF} alias (1-based, as written in the affix/dic files) to its
   * flag string.
   *
   * @param id 1-based alias number
   * @return Flag string registered for that alias
   * @throws IllegalArgumentException If the id falls outside the declared alias table
   */
  private String getAliasValue(int id) {
    try {
      return aliases[id - 1];
    } catch (IndexOutOfBoundsException ex) {
      throw new IllegalArgumentException("Bad flag alias number:" + id, ex);
    }
  }
| |
| /** |
| * Abstraction of the process of parsing flags taken from the affix and dic files |
| */ |
| private static abstract class FlagParsingStrategy { |
| |
| /** |
| * Parses the given String into a single flag |
| * |
| * @param rawFlag String to parse into a flag |
| * @return Parsed flag |
| */ |
| char parseFlag(String rawFlag) { |
| return parseFlags(rawFlag)[0]; |
| } |
| |
| /** |
| * Parses the given String into multiple flags |
| * |
| * @param rawFlags String to parse into flags |
| * @return Parsed flags |
| */ |
| abstract char[] parseFlags(String rawFlags); |
| } |
| |
| /** |
| * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags. |
| * Can be used with both the ASCII and UTF-8 flag types. |
| */ |
| private static class SimpleFlagParsingStrategy extends FlagParsingStrategy { |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public char[] parseFlags(String rawFlags) { |
| return rawFlags.toCharArray(); |
| } |
| } |
| |
| /** |
| * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case |
| * of multiple flags, each number is separated by a comma. |
| */ |
| private static class NumFlagParsingStrategy extends FlagParsingStrategy { |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public char[] parseFlags(String rawFlags) { |
| String[] rawFlagParts = rawFlags.trim().split(","); |
| char[] flags = new char[rawFlagParts.length]; |
| |
| for (int i = 0; i < rawFlagParts.length; i++) { |
| // note, removing the trailing X/leading I for nepali... what is the rule here?! |
| flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", "")); |
| } |
| |
| return flags; |
| } |
| } |
| |
| /** |
| * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes |
| * must be combined into a single character. |
| * |
| * TODO (rmuir) test |
| */ |
| private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy { |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public char[] parseFlags(String rawFlags) { |
| if (rawFlags.length() == 0) { |
| return new char[0]; |
| } |
| |
| StringBuilder builder = new StringBuilder(); |
| for (int i = 0; i < rawFlags.length(); i+=2) { |
| char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1)); |
| builder.append(cookedFlag); |
| } |
| |
| char flags[] = new char[builder.length()]; |
| builder.getChars(0, builder.length(), flags, 0); |
| return flags; |
| } |
| } |
| |
  /** Returns true if dictionary matching is case insensitive. */
  public boolean isIgnoreCase() {
    return ignoreCase;
  }
| } |