| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using System; |
| using System.Collections.Generic; |
| using System.Globalization; |
| using System.IO; |
| using System.Text; |
| using System.Text.RegularExpressions; |
| |
| namespace Lucene.Net.Analysis.Hunspell { |
| public class HunspellDictionary { |
// Shared word form handed out for dictionary entries that carry no flag section.
private static readonly HunspellWord NoFlags = new HunspellWord();

// Keywords that introduce the affix-file directives this class understands.
private static readonly string PREFIX_KEY = "PFX";
private static readonly string SUFFIX_KEY = "SFX";
private static readonly string FLAG_KEY = "FLAG";
private static readonly string AF_KEY = "AF";

// Values accepted after the FLAG keyword.
private static readonly string NUM_FLAG_TYPE = "num";
private static readonly string UTF8_FLAG_TYPE = "UTF-8";
private static readonly string LONG_FLAG_TYPE = "long";

// Format strings that anchor an affix condition at the start (prefix) or end (suffix) of a word.
private static readonly string PREFIX_CONDITION_REGEX_PATTERN = @"^{0}";
private static readonly string SUFFIX_CONDITION_REGEX_PATTERN = @"{0}$";

// Affix rules keyed by the text they append.
private readonly Dictionary<string, List<HunspellAffix>> _prefixes = new Dictionary<string, List<HunspellAffix>>();
private readonly Dictionary<string, List<HunspellAffix>> _suffixes = new Dictionary<string, List<HunspellAffix>>();

// Word forms keyed by the dictionary entry text.
private readonly Dictionary<string, List<HunspellWord>> _words = new Dictionary<string, List<HunspellWord>>();

// Flag aliases (AF directives) keyed by their 1-based position rendered as a string.
private readonly Dictionary<string, char[]> _aliases = new Dictionary<string, char[]>();

// Strategy used until (and unless) the affix file supplies a FLAG directive.
private FlagParsingStrategy _flagParsingStrategy = new SimpleFlagParsingStrategy();
| |
/// <summary>
/// Builds a HunspellDictionary from one affix stream and a single dictionary stream.
/// Delegates to the overload that accepts several dictionary streams.
/// </summary>
/// <param name="affix">Stream for reading the hunspell affix file.</param>
/// <param name="dictionary">Stream for reading the hunspell dictionary file.</param>
/// <exception cref="IOException">Can be thrown while reading from the streams.</exception>
/// <exception cref="InvalidDataException">Can be thrown if the content of the files does not meet expected formats.</exception>
public HunspellDictionary(Stream affix, Stream dictionary)
    : this(affix, new Stream[] { dictionary }) {
}
| |
/// <summary>
/// Builds a HunspellDictionary from one affix stream and any number of dictionary streams.
/// The character encoding is read from the affix stream first and is then used to decode
/// the remainder of the affix stream and every dictionary stream.
/// </summary>
/// <param name="affix">Stream for reading the hunspell affix file.</param>
/// <param name="dictionaries">Streams for reading the hunspell dictionary files.</param>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="affix"/> or <paramref name="dictionaries"/> is <c>null</c>.</exception>
/// <exception cref="IOException">Can be thrown while reading from the streams.</exception>
/// <exception cref="InvalidDataException">Can be thrown if the content of the files does not meet expected formats.</exception>
public HunspellDictionary(Stream affix, IEnumerable<Stream> dictionaries) {
    if (affix == null) {
        throw new ArgumentNullException("affix");
    }
    if (dictionaries == null) {
        throw new ArgumentNullException("dictionaries");
    }

    // The encoding name has to be taken from the raw bytes before any decoding reader is created.
    var charset = Encoding.GetEncoding(ReadDictionaryEncoding(affix));

    // Affix data goes first: it determines the flag parsing strategy and aliases the dictionaries rely on.
    ReadAffixFile(affix, charset);

    foreach (var dictionaryStream in dictionaries) {
        ReadDictionaryFile(dictionaryStream, charset);
    }
}
| |
/// <summary>
/// Looks up the HunspellWords registered for the given dictionary entry.
/// </summary>
/// <param name="word">Entry text to look up.</param>
/// <returns>The word forms stored for <paramref name="word"/>, or <c>null</c> if none are found.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="word"/> is <c>null</c>.</exception>
public IEnumerable<HunspellWord> LookupWord(String word) {
    if (word == null) {
        throw new ArgumentNullException("word");
    }

    List<HunspellWord> matches;
    return _words.TryGetValue(word, out matches) ? matches : null;
}
| |
/// <summary>
/// Finds the prefix rules whose append text equals the String built from the given
/// char array, offset and length.
/// </summary>
/// <param name="word">Char array to generate the String from.</param>
/// <param name="offset">Offset in the char array that the String starts at.</param>
/// <param name="length">Length from the offset that the String is.</param>
/// <returns>The matching HunspellAffix prefixes, or <c>null</c> if none are found.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="word"/> is <c>null</c>.</exception>
public IEnumerable<HunspellAffix> LookupPrefix(char[] word, int offset, int length) {
    if (word == null) {
        throw new ArgumentNullException("word");
    }

    var append = new String(word, offset, length);

    List<HunspellAffix> matches;
    return _prefixes.TryGetValue(append, out matches) ? matches : null;
}
| |
/// <summary>
/// Finds the suffix rules whose append text equals the String built from the given
/// char array, offset and length.
/// </summary>
/// <param name="word">Char array to generate the String from.</param>
/// <param name="offset">Offset in the char array that the String starts at.</param>
/// <param name="length">Length from the offset that the String is.</param>
/// <returns>The matching HunspellAffix suffixes, or <c>null</c> if none are found.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="word"/> is <c>null</c>.</exception>
public IEnumerable<HunspellAffix> LookupSuffix(char[] word, int offset, int length) {
    if (word == null) {
        throw new ArgumentNullException("word");
    }

    var append = new String(word, offset, length);

    List<HunspellAffix> matches;
    return _suffixes.TryGetValue(append, out matches) ? matches : null;
}
| |
/// <summary>
/// Reads the affix file through the provided Stream, building up the prefix and suffix maps,
/// the flag aliases and the flag parsing strategy.
/// </summary>
/// <param name="affixStream">Stream to read the content of the affix file from.</param>
/// <param name="encoding">Encoding to decode the content of the file.</param>
/// <exception cref="ArgumentNullException">Thrown if either argument is <c>null</c>.</exception>
/// <exception cref="IOException">Can be thrown while reading from the Stream.</exception>
private void ReadAffixFile(Stream affixStream, Encoding encoding) {
    if (affixStream == null) throw new ArgumentNullException("affixStream");
    if (encoding == null) throw new ArgumentNullException("encoding");

    // Disposing the StreamReader at the end of this block also closes affixStream.
    using (var reader = new StreamReader(affixStream, encoding)) {
        String line;
        while ((line = reader.ReadLine()) != null) {
            // Directive keywords are fixed ASCII tokens, so compare them ordinally rather than
            // with the current culture's linguistic rules.
            if (line.StartsWith(PREFIX_KEY, StringComparison.Ordinal)) {
                // The parser consumes the rule lines that follow the header from the same reader.
                ParseAffix(_prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
            } else if (line.StartsWith(SUFFIX_KEY, StringComparison.Ordinal)) {
                ParseAffix(_suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
            } else if (line.StartsWith(FLAG_KEY, StringComparison.Ordinal)) {
                // Assume that the FLAG line comes before any prefix or suffixes
                // Store the strategy so it can be used when parsing the dic file
                _flagParsingStrategy = GetFlagParsingStrategy(line);
            } else if (line.StartsWith(AF_KEY, StringComparison.Ordinal)) {
                // Parse Alias Flag
                ParseAliasFlag(line, reader);
            }
            // Any other line is ignored.
        }
    }
}
| |
/// <summary>
/// Parses an alias flag (AF) section and stores each alias in the alias map, keyed by its
/// 1-based position within the section.
/// </summary>
/// <param name="line">Header line of the section; its second whitespace-separated field is the number of alias lines that follow.</param>
/// <param name="reader">TextReader positioned right after the header line.</param>
/// <exception cref="ArgumentNullException">Thrown if either argument is <c>null</c>.</exception>
/// <exception cref="InvalidDataException">
/// Thrown if the header has no valid count, the file ends before all announced alias lines
/// were read, or a line in the section is not a well-formed AF directive.
/// </exception>
private void ParseAliasFlag(String line, TextReader reader) {
    if (line == null) throw new ArgumentNullException("line");
    if (reader == null) throw new ArgumentNullException("reader");

    var args = Regex.Split(line, "\\s+");
    int numLines;
    if (args.Length < 2 ||
        !Int32.TryParse(args[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out numLines))
        throw new InvalidDataException("Malformed AF header, expected 'AF count' : " + line);

    for (var i = 0; i < numLines; i++) {
        line = reader.ReadLine();
        if (line == null)
            throw new InvalidDataException("Unexpected end of affix file while reading AF directives.");

        var ruleArgs = Regex.Split(line, "\\s+");

        // InvalidDataException derives from Exception, so callers that caught the former
        // bare Exception keep working.
        if (ruleArgs.Length < 2 || ruleArgs[0] != "AF")
            throw new InvalidDataException("File corrupted, should be AF directive : " + line);

        var appendFlags = _flagParsingStrategy.ParseFlags(ruleArgs[1]);

        // Aliases are referenced by their 1-based position in the section.
        _aliases.Add((i + 1).ToString(CultureInfo.InvariantCulture), appendFlags);
    }
}
| |
/// <summary>
/// Parses a specific affix rule putting the result into the provided affix map.
/// The header announces how many rule lines follow; each of them is read from
/// <paramref name="reader"/> and turned into one HunspellAffix, stored under its append text.
/// </summary>
/// <param name="affixes">Map where the result of the parsing will be put.</param>
/// <param name="header">Header line of the affix rule (keyword, flag, cross-product marker, rule count).</param>
/// <param name="reader">TextReader to read the content of the rule from.</param>
/// <param name="conditionPattern">Pattern to be used to generate the condition regex pattern.</param>
/// <exception cref="ArgumentNullException">Thrown if any argument is <c>null</c>.</exception>
/// <exception cref="InvalidDataException">
/// Thrown if the header or a rule line has too few fields, the rule count is not a number,
/// the file ends before all announced rules were read, or a rule references an unknown flag alias.
/// </exception>
private void ParseAffix(Dictionary<String, List<HunspellAffix>> affixes, String header, TextReader reader, String conditionPattern) {
    if (affixes == null) throw new ArgumentNullException("affixes");
    if (header == null) throw new ArgumentNullException("header");
    if (reader == null) throw new ArgumentNullException("reader");
    if (conditionPattern == null) throw new ArgumentNullException("conditionPattern");

    var args = Regex.Split(header, "\\s+");
    int numLines;
    if (args.Length < 4 ||
        !Int32.TryParse(args[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out numLines))
        throw new InvalidDataException("Malformed affix header, expected 'PFX|SFX flag cross_product count' : " + header);

    var crossProduct = args[2].Equals("Y");

    // Once the affix file has declared aliases, flag sections are alias references, not raw flags.
    var hasAliases = _aliases.Count > 0;

    for (var i = 0; i < numLines; i++) {
        var line = reader.ReadLine();
        if (line == null)
            throw new InvalidDataException("Unexpected end of affix file while reading the rules of : " + header);

        var ruleArgs = Regex.Split(line, "\\s+");
        if (ruleArgs.Length < 4)
            throw new InvalidDataException("Malformed affix rule, expected at least 'PFX|SFX flag strip affix' : " + line);

        var affix = new HunspellAffix();

        affix.Flag = _flagParsingStrategy.ParseFlag(ruleArgs[1]);
        // "0" in the strip field means that nothing is stripped.
        affix.Strip = (ruleArgs[2] == "0") ? "" : ruleArgs[2];

        var affixArg = ruleArgs[3];

        // An optional "/flags" section after the affix text carries continuation flags.
        var flagSep = affixArg.LastIndexOf('/');
        if (flagSep != -1) {
            var cflag = affixArg.Substring(flagSep + 1);

            Char[] appendFlags;
            if (hasAliases) {
                if (!_aliases.TryGetValue(cflag, out appendFlags))
                    throw new InvalidDataException("Unknown flag alias '" + cflag + "' in affix rule : " + line);
            } else {
                appendFlags = _flagParsingStrategy.ParseFlags(cflag);
            }

            // Sorted in place; for aliases this sorts the array stored in the alias map itself.
            Array.Sort(appendFlags);
            affix.AppendFlags = appendFlags;
            affix.Append = affixArg.Substring(0, flagSep);
        } else {
            affix.Append = affixArg;
        }

        // A rule may omit the condition field; "." is used in its place.
        // NOTE(review): hunspell(5) describes "." as the zero condition - confirm this default.
        var condition = ruleArgs.Length > 4 ? ruleArgs[4] : ".";
        affix.SetCondition(condition, String.Format(conditionPattern, condition));
        affix.IsCrossProduct = crossProduct;

        List<HunspellAffix> list;
        if (!affixes.TryGetValue(affix.Append, out list))
            affixes.Add(affix.Append, list = new List<HunspellAffix>());

        list.Add(affix);
    }
}
| |
/// <summary>
/// Parses the encoding specified in the affix file readable through the provided Stream.
/// The stream is read byte by byte and is left positioned right after the <c>SET</c> line,
/// so that a decoding reader can take over from there.
/// </summary>
/// <param name="affix">Stream for reading the affix file.</param>
/// <returns>Encoding name specified in the affix file, with surrounding whitespace removed.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="affix"/> is <c>null</c>.</exception>
/// <exception cref="InvalidDataException">
/// Thrown if the stream ends before a content line is found, or if the first non-empty
/// non-comment line read from the file does not adhere to the format <c>SET encoding</c>.
/// </exception>
private static String ReadDictionaryEncoding(Stream affix) {
    if (affix == null) throw new ArgumentNullException("affix");

    const String setDirective = "SET ";
    // A UTF-8 byte order mark as it appears when each byte is widened to a char below.
    const String utf8ByteOrderMark = "\u00EF\u00BB\u00BF";

    var builder = new StringBuilder();
    var isFirstLine = true;
    for (; ; ) {
        builder.Length = 0;
        int ch;
        // Collect one line of raw bytes; '\r' is dropped so that CRLF files behave like LF files.
        while ((ch = affix.ReadByte()) >= 0) {
            if (ch == '\n') {
                break;
            }
            if (ch != '\r') {
                builder.Append((char)ch);
            }
        }

        var line = builder.ToString();
        if (isFirstLine) {
            isFirstLine = false;
            // Tolerate a byte order mark in front of the first line.
            if (line.StartsWith(utf8ByteOrderMark, StringComparison.Ordinal))
                line = line.Substring(utf8ByteOrderMark.Length);
        }

        // Skip empty lines, comment lines and lines that only contain whitespace.
        if (line.Length == 0 || line[0] == '#' || line.Trim().Length == 0) {
            if (ch < 0)
                throw new InvalidDataException("Unexpected end of affix file.");

            continue;
        }

        // StartsWith also copes with lines shorter than the directive itself, which a
        // fixed-length substring comparison cannot do without throwing.
        if (line.StartsWith(setDirective, StringComparison.Ordinal)) {
            // cleanup the encoding string, too (whitespace)
            return line.Substring(setDirective.Length).Trim();
        }

        throw new InvalidDataException("The first non-comment line in the affix file must " +
                                       "be a 'SET charset', was: '" + line + "'");
    }
}
| |
/// <summary>
/// Determines the appropriate <see cref="FlagParsingStrategy"/> based on the FLAG definition line taken from the affix file.
/// </summary>
/// <param name="flagLine">Line containing the flag information, i.e. the FLAG keyword followed by the flag type.</param>
/// <returns>FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="flagLine"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">Thrown if the flag type is missing or not one of the known types.</exception>
private static FlagParsingStrategy GetFlagParsingStrategy(String flagLine) {
    if (flagLine == null) throw new ArgumentNullException("flagLine");

    // Take everything after the keyword and trim it, so that a bare "FLAG" line, repeated
    // separators or trailing whitespace do not cause a crash or a spurious mismatch.
    var flagType = flagLine.Length > FLAG_KEY.Length
                       ? flagLine.Substring(FLAG_KEY.Length).Trim()
                       : String.Empty;

    if (NUM_FLAG_TYPE.Equals(flagType))
        return new NumFlagParsingStrategy();

    // One char per flag, which is also the default strategy.
    if (UTF8_FLAG_TYPE.Equals(flagType))
        return new SimpleFlagParsingStrategy();

    if (LONG_FLAG_TYPE.Equals(flagType))
        return new DoubleASCIIFlagParsingStrategy();

    throw new ArgumentException("Unknown flag type: " + flagType);
}
| |
/// <summary>
/// Reads the dictionary file through the provided Stream, building up the words map.
/// The first line must hold the number of entries; every following non-empty line is one
/// entry, optionally followed by <c>/flags</c> and a whitespace-separated description.
/// </summary>
/// <param name="dictionary">Stream to read the dictionary file through.</param>
/// <param name="encoding">Encoding used to decode the contents of the file.</param>
/// <exception cref="ArgumentNullException">Thrown if either argument is <c>null</c>.</exception>
/// <exception cref="IOException">Can be thrown while reading from the file.</exception>
/// <exception cref="InvalidDataException">
/// Thrown if the first line is missing or is not a number, or if an entry references an unknown flag alias.
/// </exception>
private void ReadDictionaryFile(Stream dictionary, Encoding encoding) {
    if (dictionary == null) throw new ArgumentNullException("dictionary");
    if (encoding == null) throw new ArgumentNullException("encoding");

    // NOTE(review): the reader is not disposed here, so the caller-supplied stream stays open.
    var reader = new StreamReader(dictionary, encoding);

    // TODO: don't create millions of strings.
    var line = reader.ReadLine(); // first line is number of entries
    int announcedEntries; // only validated, the entries are simply read until the end of the file
    if (line == null ||
        !Int32.TryParse(line, NumberStyles.Integer, CultureInfo.InvariantCulture, out announcedEntries))
        throw new InvalidDataException("The first line of the dictionary file must be the number of entries, was: '" + line + "'");

    var hasAliases = _aliases.Count > 0;
    var flagTerminators = new[] { ' ', '\t' };

    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently
    while ((line = reader.ReadLine()) != null) {
        // A blank line carries no entry; do not register the empty string as a word.
        if (line.Length == 0)
            continue;

        String entry;
        HunspellWord wordForm;

        var flagSep = line.LastIndexOf('/');
        if (flagSep == -1) {
            wordForm = NoFlags;
            entry = line;
        } else {
            // There can be comments (morph description) after the flags, separated by a tab
            // or a space; the flag section ends at the first such whitespace character.
            var end = line.IndexOfAny(flagTerminators, flagSep);
            var cflag = end == -1 ? line.Substring(flagSep + 1) : line.Substring(flagSep + 1, end - flagSep - 1);

            Char[] flags;
            if (hasAliases) {
                if (!_aliases.TryGetValue(cflag, out flags))
                    throw new InvalidDataException("Unknown flag alias '" + cflag + "' in dictionary entry : " + line);
            } else {
                flags = _flagParsingStrategy.ParseFlags(cflag);
            }

            wordForm = new HunspellWord(flags);

            entry = line.Substring(0, flagSep);
        }

        List<HunspellWord> entries;
        if (!_words.TryGetValue(entry, out entries))
            _words.Add(entry, entries = new List<HunspellWord>());

        entries.Add(wordForm);
    }
}
| |
| #region Nested type: DoubleASCIIFlagParsingStrategy |
| |
/// <summary>
/// Implementation of <see cref="FlagParsingStrategy"/> that assumes each flag is encoded as
/// two ASCII characters whose codes must be combined into a single character.
/// </summary>
private class DoubleASCIIFlagParsingStrategy : FlagParsingStrategy {
    /// <summary>
    /// Parses the given String into flags, two characters per flag.
    /// </summary>
    /// <param name="rawFlags">String holding an even number of characters.</param>
    /// <returns>One char per pair of input characters; an empty array for an empty String.</returns>
    /// <exception cref="ArgumentNullException">Thrown if <paramref name="rawFlags"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">Thrown if <paramref name="rawFlags"/> has an odd number of characters.</exception>
    public override Char[] ParseFlags(String rawFlags) {
        if (rawFlags == null)
            throw new ArgumentNullException("rawFlags");

        // A dangling single character cannot form a flag; reject it instead of reading past the end.
        if (rawFlags.Length % 2 != 0)
            throw new ArgumentException("Invalid flags (should be even number of characters): " + rawFlags, "rawFlags");

        var flags = new Char[rawFlags.Length / 2];
        for (var i = 0; i < flags.Length; i++) {
            // Put the first character in the high byte and the second in the low byte.
            // Adding the two codes would make distinct flags such as "AB" and "BA" collide.
            flags[i] = unchecked((Char)((rawFlags[2 * i] << 8) | rawFlags[2 * i + 1]));
        }

        return flags;
    }
}
| |
| #endregion |
| |
| #region Nested type: FlagParsingStrategy |
/// <summary>
/// Base class for the different ways flags can be written in the affix and dic files.
/// </summary>
private abstract class FlagParsingStrategy {
    /// <summary>
    /// Parses the given String and returns its first flag.
    /// </summary>
    /// <param name="rawFlag">String to parse into a flag.</param>
    /// <returns>First flag produced by <see cref="ParseFlags"/> for the input.</returns>
    /// <exception cref="ArgumentNullException">Thrown if <paramref name="rawFlag"/> is <c>null</c>.</exception>
    public Char ParseFlag(String rawFlag) {
        if (rawFlag == null) {
            throw new ArgumentNullException("rawFlag");
        }

        var parsed = ParseFlags(rawFlag);
        return parsed[0];
    }

    /// <summary>
    /// Parses the given String into all the flags it contains.
    /// </summary>
    /// <param name="rawFlags">String to parse into flags.</param>
    /// <returns>Parsed flags.</returns>
    public abstract Char[] ParseFlags(String rawFlags);
}
| |
| #endregion |
| |
| #region Nested type: NumFlagParsingStrategy |
| |
/// <summary>
/// Implementation of <see cref="FlagParsingStrategy"/> for flags written as numbers.
/// Several flags are written as a comma separated list of numbers.
/// </summary>
private class NumFlagParsingStrategy : FlagParsingStrategy {
    /// <summary>
    /// Parses a comma separated list of numbers into flags, one char per number.
    /// </summary>
    /// <param name="rawFlags">Comma separated numeric flags.</param>
    /// <returns>One char per list element, holding the parsed number.</returns>
    public override Char[] ParseFlags(String rawFlags) {
        var parts = rawFlags.Trim().Split(',');

        return Array.ConvertAll<String, Char>(parts, delegate(String part) {
            // Everything that is not a digit is discarded before parsing (seen with a
            // trailing X / leading I in Nepali data; the rule behind it is unclear).
            var digitsOnly = Regex.Replace(part, "[^0-9]", "");
            return (Char)Int32.Parse(digitsOnly);
        });
    }
}
| |
| #endregion |
| |
| #region Nested type: SimpleFlagParsingStrategy |
| |
/// <summary>
/// Simplest <see cref="FlagParsingStrategy"/>: every char of the String is one flag.
/// Can be used with both the ASCII and UTF-8 flag types.
/// </summary>
private class SimpleFlagParsingStrategy : FlagParsingStrategy {
    /// <summary>
    /// Returns the chars of the given String as individual flags.
    /// </summary>
    /// <param name="rawFlags">String whose chars are the flags.</param>
    /// <returns>A new array holding the chars of <paramref name="rawFlags"/> in order.</returns>
    public override Char[] ParseFlags(String rawFlags) {
        var flags = new Char[rawFlags.Length];
        rawFlags.CopyTo(0, flags, 0, flags.Length);
        return flags;
    }
}
| |
| #endregion |
| } |
| } |