// blob: 49b04c28aef9f49b9d45b7a3700ad668d6262b94 [file] [log] [blame]
using Lucene.Net.Store;
using Lucene.Net.Support;
using Lucene.Net.Support.IO;
using Lucene.Net.Util;
using Lucene.Net.Util.Automaton;
using Lucene.Net.Util.Fst;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace Lucene.Net.Analysis.Hunspell
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// In-memory structure for the dictionary (.dic) and affix (.aff)
/// data of a hunspell dictionary.
/// </summary>
public class Dictionary
{
// Shared empty flag array used for dictionary entries and affix rules that carry no flags.
private static readonly char[] NOFLAGS = new char[0];
// Directive keywords that ReadAffixFile matches at the start of a line of the affix (.aff) file.
private const string ALIAS_KEY = "AF";
private const string PREFIX_KEY = "PFX";
private const string SUFFIX_KEY = "SFX";
private const string FLAG_KEY = "FLAG";
private const string COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
private const string CIRCUMFIX_KEY = "CIRCUMFIX";
private const string IGNORE_KEY = "IGNORE";
private const string ICONV_KEY = "ICONV";
private const string OCONV_KEY = "OCONV";
// Values accepted on a FLAG line; GetFlagParsingStrategy maps each one to a FlagParsingStrategy.
private const string NUM_FLAG_TYPE = "num";
private const string UTF8_FLAG_TYPE = "UTF-8";
private const string LONG_FLAG_TYPE = "long";
// Format strings that wrap an affix condition into a whole-word regex ({0} is the condition).
// TODO: really for suffixes we should reverse the automaton and run them backwards
private const string PREFIX_CONDITION_REGEX_PATTERN = "{0}.*";
private const string SUFFIX_CONDITION_REGEX_PATTERN = ".*{0}";
// FSTs built by AffixFST: affix text -> list of affix indices (values of currentAffix at parse time).
internal FST<Int32sRef> prefixes;
internal FST<Int32sRef> suffixes;
// all condition checks used by prefixes and suffixes. these are typically re-used across
// many affix stripping rules. so these are deduplicated, to save RAM.
// Index 0 is a null placeholder registered for the always-matching ".*" condition.
internal List<CharacterRunAutomaton> patterns = new List<CharacterRunAutomaton>();
// the entries in the .dic file, mapping to their set of flags.
// the fst output is the ordinal list for flagLookup
internal FST<Int32sRef> words;
// the list of unique flagsets (wordforms). theoretically huge, but practically
// small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
// Ordinal 0 is the empty flag set (added first by the constructor).
internal BytesRefHash flagLookup = new BytesRefHash();
// the list of unique strip affixes.
// stripData holds all strip strings concatenated; strip i occupies
// stripData[stripOffsets[i] .. stripOffsets[i + 1]) (the last offset is the total length).
internal char[] stripData;
internal int[] stripOffsets;
// 8 bytes per affix
// Each record is four 16-bit values written by ParseAffix, in this order:
// flag, strip ordinal, (pattern index << 1 | crossProduct bit), append-flags ordinal.
internal byte[] affixData = new byte[64];
// Index of the next affix record to write into affixData.
private int currentAffix = 0;
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
// Alias table filled from AF lines; only needed while parsing and set to null by the constructor afterwards.
private string[] aliases;
private int aliasCount = 0;
private readonly DirectoryInfo tempDir = OfflineSorter.DefaultTempDir(); // TODO: make this configurable?
internal bool ignoreCase;
internal bool complexPrefixes;
internal bool twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping
internal int circumfix = -1; // circumfix flag, or -1 if one is not defined
// ignored characters (dictionary, affix, inputs)
// Sorted after parsing the IGNORE line.
private char[] ignore;
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
internal FST<CharsRef> iconv;
internal FST<CharsRef> oconv;
// True when ignoreCase is set, or when an IGNORE or ICONV directive was parsed.
internal bool needsInputCleaning;
// True when an OCONV directive produced a conversion table.
internal bool needsOutputCleaning;
// LUCENENET: Added so we can get better performance than creating the regex in every tight loop.
private static Regex whitespacePattern = new Regex("\\s+", RegexOptions.Compiled);
/// <summary>
/// Builds a case-sensitive <see cref="Dictionary"/> from one affix stream and one dictionary stream.
/// Both <see cref="Stream"/>s remain owned by the caller and are not disposed here.
/// </summary>
/// <param name="affix"> <see cref="Stream"/> positioned at the hunspell affix (.aff) data (not disposed). </param>
/// <param name="dictionary"> <see cref="Stream"/> positioned at the hunspell dictionary (.dic) data (not disposed). </param>
/// <exception cref="IOException"> May be thrown while reading from either <see cref="Stream"/>. </exception>
/// <exception cref="Exception"> May be thrown when the file contents do not follow the expected format. </exception>
public Dictionary(Stream affix, Stream dictionary)
    : this(affix, new List<Stream> { dictionary }, ignoreCase: false)
{
}
/// <summary>
/// Creates a new <see cref="Dictionary"/> containing the information read from the provided <see cref="Stream"/>s to hunspell affix
/// and dictionary files.
/// You have to dispose the provided <see cref="Stream"/>s yourself.
/// </summary>
/// <remarks>
/// The affix stream is copied to a temporary file because it has to be read twice: once to discover
/// the declared encoding and once to parse the rules with that encoding. The temporary file is removed
/// whether or not parsing succeeds.
/// </remarks>
/// <param name="affix"> <see cref="Stream"/> for reading the hunspell affix file (won't be disposed). </param>
/// <param name="dictionaries"> <see cref="Stream"/> for reading the hunspell dictionary files (won't be disposed). </param>
/// <param name="ignoreCase"> ignore case? </param>
/// <exception cref="IOException"> Can be thrown while reading from the <see cref="Stream"/>s </exception>
/// <exception cref="Exception"> Can be thrown if the content of the files does not meet expected formats </exception>
public Dictionary(Stream affix, IList<Stream> dictionaries, bool ignoreCase)
{
    this.ignoreCase = ignoreCase;
    this.needsInputCleaning = ignoreCase;
    this.needsOutputCleaning = false; // set if we have an OCONV
    flagLookup.Add(new BytesRef()); // no flags -> ord 0

    FileInfo aff = FileSupport.CreateTempFile("affix", "aff", tempDir);
    try
    {
        using (Stream @out = aff.Open(FileMode.Open, FileAccess.ReadWrite))
        {
            // copy contents of affix stream to temp file
            affix.CopyTo(@out);
        }

        // pass 1: get encoding
        string encoding;
        using (Stream aff1 = aff.Open(FileMode.Open, FileAccess.Read))
        {
            encoding = GetDictionaryEncoding(aff1);
        }

        // pass 2: parse affixes
        Encoding decoder = GetSystemEncoding(encoding);
        using (Stream aff2 = aff.Open(FileMode.Open, FileAccess.Read))
        {
            ReadAffixFile(aff2, decoder);
        }

        // read dictionary entries
        Int32SequenceOutputs o = Int32SequenceOutputs.Singleton;
        Builder<Int32sRef> b = new Builder<Int32sRef>(FST.INPUT_TYPE.BYTE4, o);
        ReadDictionaryFiles(dictionaries, decoder, b);
        words = b.Finish();
        aliases = null; // no longer needed
    }
    finally
    {
        // Always remove the temp copy, including when parsing failed; deletion itself is best effort.
        try
        {
            aff.Delete();
        }
        catch
        {
            // ignore
        }
    }
}
/// <summary>
/// Finds the flag-set ordinals stored for a word in the dictionary (.dic) entries.
/// </summary>
/// <param name="word"> Buffer holding the word </param>
/// <param name="offset"> Index of the first char of the word in <paramref name="word"/> </param>
/// <param name="length"> Number of chars that make up the word </param>
/// <returns> The output stored for the word, or <c>null</c> when the word is not present </returns>
internal virtual Int32sRef LookupWord(char[] word, int offset, int length)
    => Lookup(words, word, offset, length);
/// <summary>
/// Finds the prefix rules whose append text equals the given slice of <paramref name="word"/>.
/// </summary>
/// <param name="word"> <see cref="char"/> buffer containing the text to match </param>
/// <param name="offset"> Index in <paramref name="word"/> where the text starts </param>
/// <param name="length"> Number of chars, counted from <paramref name="offset"/>, to match </param>
/// <returns> The affix indices stored for that text, or <c>null</c> if there are none </returns>
internal virtual Int32sRef LookupPrefix(char[] word, int offset, int length)
    => Lookup(prefixes, word, offset, length);
/// <summary>
/// Finds the suffix rules whose append text equals the given slice of <paramref name="word"/>.
/// </summary>
/// <param name="word"> <see cref="char"/> buffer containing the text to match </param>
/// <param name="offset"> Index in <paramref name="word"/> where the text starts </param>
/// <param name="length"> Number of chars, counted from <paramref name="offset"/>, to match </param>
/// <returns> The affix indices stored for that text, or <c>null</c> if there are none </returns>
internal virtual Int32sRef LookupSuffix(char[] word, int offset, int length)
    => Lookup(suffixes, word, offset, length);
// TODO: this is pretty stupid, considering how the stemming algorithm works
// we can speed it up to be significantly faster!
/// <summary>
/// Walks <paramref name="fst"/> one code point at a time over the given slice of <paramref name="word"/>
/// and returns the combined output of the matched path, or <c>null</c> when the FST is <c>null</c> or
/// the slice is not an accepted input.
/// </summary>
internal virtual Int32sRef Lookup(FST<Int32sRef> fst, char[] word, int offset, int length)
{
    if (fst == null)
    {
        return null;
    }

    FST.BytesReader fstReader = fst.GetBytesReader();
    FST.Arc<Int32sRef> current = fst.GetFirstArc(new FST.Arc<Int32sRef>());
    Int32sRef noOutput = fst.Outputs.NoOutput;
    // Outputs found along the path are accumulated here.
    Int32sRef accumulated = noOutput;
    int limit = offset + length;

    try
    {
        int pos = offset;
        while (pos < limit)
        {
            int codePoint = Character.CodePointAt(word, pos, limit);
            if (fst.FindTargetArc(codePoint, current, current, fstReader) == null)
            {
                return null;
            }
            if (current.Output != noOutput)
            {
                accumulated = fst.Outputs.Add(accumulated, current.Output);
            }
            pos += Character.CharCount(codePoint);
        }

        // The input only counts as a match if the path can also take the end-of-input arc.
        if (fst.FindTargetArc(FST.END_LABEL, current, current, fstReader) == null)
        {
            return null;
        }
        return current.Output != noOutput
            ? fst.Outputs.Add(accumulated, current.Output)
            : accumulated;
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.Message, bogus);
    }
}
/// <summary>
/// Reads the affix file through the provided <see cref="Stream"/>, building up the prefix and suffix maps.
/// Each line is dispatched on its leading keyword (AF, PFX, SFX, FLAG, COMPLEXPREFIXES, CIRCUMFIX,
/// IGNORE, ICONV, OCONV); lines that match none of them are skipped. After the last line the collected
/// affixes are compiled into the prefix and suffix FSTs and the unique strip strings are packed into
/// <c>stripData</c> / <c>stripOffsets</c>.
/// </summary>
/// <param name="affixStream"> <see cref="Stream"/> to read the content of the affix file from </param>
/// <param name="decoder"> <see cref="Encoding"/> to decode the content of the file </param>
/// <exception cref="IOException"> Can be thrown while reading from the <see cref="Stream"/> </exception>
/// <exception cref="Exception"> Thrown for a malformed CIRCUMFIX, IGNORE, ICONV or OCONV declaration </exception>
private void ReadAffixFile(Stream affixStream, Encoding decoder)
{
// affix text -> indices of the affix records that append that text (ordinal key order for the FST builder)
SortedDictionary<string, IList<char?>> prefixes = new SortedDictionary<string, IList<char?>>(StringComparer.Ordinal);
SortedDictionary<string, IList<char?>> suffixes = new SortedDictionary<string, IList<char?>>(StringComparer.Ordinal);
// condition regex -> index into the patterns list, for deduplication
IDictionary<string, int?> seenPatterns = new Dictionary<string, int?>();
// zero condition -> 0 ord
seenPatterns[".*"] = 0;
patterns.Add(null);
// zero strip -> 0 ord
// NOTE(review): strip ordinals are assigned from the map's Count at insertion time (see ParseAffix),
// so the packing loops below rely on LinkedHashMap enumerating keys in insertion order - confirm.
IDictionary<string, int?> seenStrips = new LinkedHashMap<string, int?>();
seenStrips[""] = 0;
// The reader is not disposed here; the caller owns affixStream.
var reader = new StreamReader(affixStream, decoder);
string line = null;
// Counts only the lines read by this loop. Rule lines consumed inside ParseAffix and
// ParseConversions through the same reader are not counted, so the numbers used in the
// error messages below can be lower than the real line number.
int lineNumber = 0;
while ((line = reader.ReadLine()) != null)
{
lineNumber++;
// ignore any BOM marker on first line
if (lineNumber == 1 && line.StartsWith("\uFEFF", StringComparison.Ordinal))
{
line = line.Substring(1);
}
if (line.StartsWith(ALIAS_KEY, StringComparison.Ordinal))
{
ParseAlias(line);
}
else if (line.StartsWith(PREFIX_KEY, StringComparison.Ordinal))
{
// header line; ParseAffix reads the rule lines that follow from the same reader
ParseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
}
else if (line.StartsWith(SUFFIX_KEY, StringComparison.Ordinal))
{
ParseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
}
else if (line.StartsWith(FLAG_KEY, StringComparison.Ordinal))
{
// Assume that the FLAG line comes before any prefix or suffixes
// Store the strategy so it can be used when parsing the dic file
flagParsingStrategy = GetFlagParsingStrategy(line);
}
else if (line.Equals(COMPLEXPREFIXES_KEY, StringComparison.Ordinal))
{
complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
}
else if (line.StartsWith(CIRCUMFIX_KEY, StringComparison.Ordinal))
{
// expected form: CIRCUMFIX <flag>
string[] parts = whitespacePattern.Split(line).TrimEnd();
if (parts.Length != 2)
{
throw new Exception(string.Format("Illegal CIRCUMFIX declaration, line {0}", lineNumber));
}
circumfix = flagParsingStrategy.ParseFlag(parts[1]);
}
else if (line.StartsWith(IGNORE_KEY, StringComparison.Ordinal))
{
// expected form: IGNORE <characters>
string[] parts = whitespacePattern.Split(line).TrimEnd();
if (parts.Length != 2)
{
throw new Exception(string.Format("Illegal IGNORE declaration, line {0}", lineNumber));
}
ignore = parts[1].ToCharArray();
Array.Sort(ignore);
needsInputCleaning = true;
}
else if (line.StartsWith(ICONV_KEY, StringComparison.Ordinal) || line.StartsWith(OCONV_KEY, StringComparison.Ordinal))
{
// expected form: ICONV|OCONV <count>, followed by <count> mapping lines read by ParseConversions
string[] parts = whitespacePattern.Split(line).TrimEnd();
string type = parts[0];
if (parts.Length != 2)
{
throw new Exception(string.Format("Illegal {0} declaration, line {1}", type, lineNumber));
}
int num = int.Parse(parts[1], CultureInfo.InvariantCulture);
FST<CharsRef> res = ParseConversions(reader, num);
if (type.Equals("ICONV", StringComparison.Ordinal))
{
iconv = res;
needsInputCleaning |= iconv != null;
}
else
{
oconv = res;
needsOutputCleaning |= oconv != null;
}
}
}
// compile the collected affixes into FSTs (the locals shadow the fields, hence "this.")
this.prefixes = AffixFST(prefixes);
this.suffixes = AffixFST(suffixes);
// first pass over the strips: total length, to size stripData
int totalChars = 0;
foreach (string strip in seenStrips.Keys)
{
totalChars += strip.Length;
}
stripData = new char[totalChars];
// one extra slot so that strip i ends at stripOffsets[i + 1]
stripOffsets = new int[seenStrips.Count + 1];
int currentOffset = 0;
int currentIndex = 0;
// second pass: copy each strip and record where it starts
foreach (string strip in seenStrips.Keys)
{
stripOffsets[currentIndex++] = currentOffset;
strip.CopyTo(0, stripData, currentOffset, strip.Length - 0);
currentOffset += strip.Length;
}
Debug.Assert(currentIndex == seenStrips.Count);
stripOffsets[currentIndex] = currentOffset;
}
/// <summary>
/// Compiles the collected affixes into an FST whose inputs are the affix strings (as UTF-32
/// code points) and whose outputs are the affix indices recorded for each string.
/// </summary>
/// <param name="affixes"> Affix text mapped to the indices of the rules that append it </param>
/// <returns> The result of finishing the FST builder </returns>
private FST<Int32sRef> AffixFST(SortedDictionary<string, IList<char?>> affixes)
{
    Builder<Int32sRef> fstBuilder = new Builder<Int32sRef>(FST.INPUT_TYPE.BYTE4, Int32SequenceOutputs.Singleton);
    Int32sRef keyScratch = new Int32sRef();

    foreach (KeyValuePair<string, IList<char?>> pair in affixes)
    {
        IList<char?> indices = pair.Value;
        Int32sRef packed = new Int32sRef(indices.Count);
        foreach (char? index in indices)
        {
            // a missing value is stored as 0
            packed.Int32s[packed.Length++] = index.GetValueOrDefault();
        }

        Lucene.Net.Util.Fst.Util.ToUTF32(pair.Key, keyScratch);
        fstBuilder.Add(keyScratch, packed);
    }

    return fstBuilder.Finish();
}
/// <summary>
/// Parses a specific affix rule putting the result into the provided affix map.
/// The header supplies the cross-product marker (<c>args[2]</c>) and the number of rule lines
/// (<c>args[3]</c>); that many lines are then read from <paramref name="reader"/>. Every rule line
/// appends one 8-byte record to <c>affixData</c> and registers the index of that record under the
/// rule's affix text in <paramref name="affixes"/>.
/// </summary>
/// <param name="affixes"> <see cref="SortedDictionary{TKey, TValue}"/> where the result of the parsing will be put </param>
/// <param name="header"> Header line of the affix rule </param>
/// <param name="reader"> <see cref="TextReader"/> to read the content of the rule from </param>
/// <param name="conditionPattern"> <see cref="string.Format(string, object[])"/> pattern to be used to generate the condition regex
/// pattern </param>
/// <param name="seenPatterns"> map from condition -> index of patterns, for deduplication. </param>
/// <param name="seenStrips"> map from strip string -> strip ordinal, for deduplication; new strips get the current map size as ordinal. </param>
/// <exception cref="IOException"> Can be thrown while reading the rule </exception>
/// <exception cref="Exception"> Thrown when a rule line has fewer than four elements </exception>
/// <exception cref="NotSupportedException"> Thrown when the number of patterns, strips or append-flag sets exceeds the encodable range </exception>
private void ParseAffix(SortedDictionary<string, IList<char?>> affixes, string header, TextReader reader, string conditionPattern, IDictionary<string, int?> seenPatterns, IDictionary<string, int?> seenStrips)
{
BytesRef scratch = new BytesRef();
StringBuilder sb = new StringBuilder();
string[] args = whitespacePattern.Split(header).TrimEnd();
bool crossProduct = args[2].Equals("Y", StringComparison.Ordinal);
int numLines = int.Parse(args[3], CultureInfo.InvariantCulture);
// make room for numLines more records of 8 bytes each (<< 3 multiplies by 8)
affixData = ArrayUtil.Grow(affixData, (currentAffix << 3) + (numLines << 3));
ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
for (int i = 0; i < numLines; i++)
{
// the writer must sit exactly at the start of the record for currentAffix
Debug.Assert(affixWriter.Position == currentAffix << 3);
string line = reader.ReadLine();
string[] ruleArgs = whitespacePattern.Split(line).TrimEnd();
// from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
// condition is optional
if (ruleArgs.Length < 4)
{
throw new Exception("The affix file contains a rule with less than four elements: " + line /*, reader.LineNumber */);// LUCENENET TODO: LineNumberReader
}
char flag = flagParsingStrategy.ParseFlag(ruleArgs[1]);
// "0" in the strip column means nothing is stripped
string strip = ruleArgs[2].Equals("0", StringComparison.Ordinal) ? "" : ruleArgs[2];
string affixArg = ruleArgs[3];
char[] appendFlags = null;
// the affix may carry continuation flags after the last '/'
int flagSep = affixArg.LastIndexOf('/');
if (flagSep != -1)
{
string flagPart = affixArg.Substring(flagSep + 1);
affixArg = affixArg.Substring(0, flagSep - 0);
// when aliases are defined, the flag part is a 1-based alias number
if (aliasCount > 0)
{
flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture));
}
appendFlags = flagParsingStrategy.ParseFlags(flagPart);
Array.Sort(appendFlags);
twoStageAffix = true;
}
// TODO: add test and fix zero-affix handling!
string condition = ruleArgs.Length > 4 ? ruleArgs[4] : ".";
// at least the gascon affix file has this issue
if (condition.StartsWith("[", StringComparison.Ordinal) && !condition.EndsWith("]", StringComparison.Ordinal))
{
condition = condition + "]";
}
// "dash hasn't got special meaning" (we must escape it)
if (condition.IndexOf('-') >= 0)
{
condition = condition.Replace("-", "\\-");
}
string regex;
if (".".Equals(condition, StringComparison.Ordinal))
{
regex = ".*"; // Zero condition is indicated by dot
}
else if (condition.Equals(strip, StringComparison.Ordinal))
{
regex = ".*"; // TODO: optimize this better:
// if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
// but this is complicated...
}
else
{
regex = string.Format(CultureInfo.InvariantCulture, conditionPattern, condition);
}
// deduplicate patterns
if (!seenPatterns.TryGetValue(regex, out int? patternIndex) || patternIndex == null)
{
patternIndex = patterns.Count;
if (patternIndex > short.MaxValue)
{
throw new System.NotSupportedException("Too many patterns, please report this to dev@lucene.apache.org");
}
seenPatterns[regex] = patternIndex;
CharacterRunAutomaton pattern = new CharacterRunAutomaton((new RegExp(regex, RegExpSyntax.NONE)).ToAutomaton());
patterns.Add(pattern);
}
// deduplicate strips; a new strip gets the next ordinal (the map's size before insertion)
if (!seenStrips.TryGetValue(strip, out int? stripOrd) || stripOrd == null)
{
stripOrd = seenStrips.Count;
seenStrips[strip] = stripOrd;
if (stripOrd > char.MaxValue)
{
throw new System.NotSupportedException("Too many unique strips, please report this to dev@lucene.apache.org");
}
}
if (appendFlags == null)
{
appendFlags = NOFLAGS;
}
EncodeFlags(scratch, appendFlags);
// the result is negative when the flag set was already registered (handled below)
int appendFlagsOrd = flagLookup.Add(scratch);
if (appendFlagsOrd < 0)
{
// already exists in our hash
appendFlagsOrd = (-appendFlagsOrd) - 1;
}
else if (appendFlagsOrd > short.MaxValue)
{
// this limit is probably flexible, but its a good sanity check too
throw new System.NotSupportedException("Too many unique append flags, please report this to dev@lucene.apache.org");
}
// record layout: flag, strip ordinal, pattern ordinal (with crossProduct bit), append-flags ordinal
affixWriter.WriteInt16((short)flag);
affixWriter.WriteInt16((short)stripOrd);
// encode crossProduct into patternIndex
int patternOrd = (int)patternIndex << 1 | (crossProduct ? 1 : 0);
affixWriter.WriteInt16((short)patternOrd);
affixWriter.WriteInt16((short)appendFlagsOrd);
// apply the same input cleaning to the affix text that is applied to dictionary entries
if (needsInputCleaning)
{
string cleaned = CleanInput(affixArg, sb);
affixArg = cleaned.ToString();
}
if (!affixes.TryGetValue(affixArg, out IList<char?> list) || list == null)
{
affixes[affixArg] = list = new List<char?>();
}
// the affix index is stored as a char
list.Add((char)currentAffix);
currentAffix++;
}
}
/// <summary>
/// Reads <paramref name="num"/> conversion lines (keyword, source text, replacement text) from
/// <paramref name="reader"/> and compiles them into an FST from source text to replacement text.
/// </summary>
/// <exception cref="Exception"> Thrown when a line does not consist of exactly three elements </exception>
/// <exception cref="InvalidOperationException"> Thrown when the same source text is mapped twice </exception>
private FST<CharsRef> ParseConversions(TextReader reader, int num)
{
    IDictionary<string, string> table = new SortedDictionary<string, string>(StringComparer.Ordinal);

    int remaining = num;
    while (remaining-- > 0)
    {
        string rawLine = reader.ReadLine();
        string[] fields = whitespacePattern.Split(rawLine).TrimEnd();
        if (fields.Length != 3)
        {
            // LUCENENET TODO: LineNumberReader
            throw new Exception("invalid syntax: " + rawLine);
        }

        string source = fields[1];
        if (table.Put(source, fields[2]) != null)
        {
            throw new System.InvalidOperationException("duplicate mapping specified for: " + source);
        }
    }

    Builder<CharsRef> fstBuilder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, CharSequenceOutputs.Singleton);
    Int32sRef keyScratch = new Int32sRef();
    foreach (KeyValuePair<string, string> mapping in table)
    {
        Lucene.Net.Util.Fst.Util.ToUTF16(mapping.Key, keyScratch);
        fstBuilder.Add(keyScratch, new CharsRef(mapping.Value));
    }

    return fstBuilder.Finish();
}
/// <summary>
/// pattern accepts optional BOM + SET + any whitespace.
/// The BOM is spelled as three separate chars because <see cref="GetDictionaryEncoding(Stream)"/>
/// widens each raw byte to a char before matching. </summary>
internal static readonly Regex ENCODING_PATTERN = new Regex("^(\u00EF\u00BB\u00BF)?SET\\s+", RegexOptions.Compiled);
/// <summary>
/// Parses the encoding specified in the affix file readable through the provided <see cref="Stream"/>.
/// The stream is scanned byte by byte, one line at a time. Empty lines, whitespace-only lines and
/// lines starting with <c>#</c> are skipped, as is any other line that is not a SET declaration.
/// </summary>
/// <param name="affix"> <see cref="Stream"/> for reading the affix file </param>
/// <returns> Encoding specified in the affix file </returns>
/// <exception cref="IOException"> Can be thrown while reading from the <see cref="Stream"/> </exception>
/// <exception cref="Exception"> Thrown if the end of the stream is reached before a line of the form <c>SET &lt;encoding&gt;</c> is found </exception>
internal static string GetDictionaryEncoding(Stream affix)
{
    StringBuilder encoding = new StringBuilder();
    for (;;)
    {
        encoding.Length = 0;
        int ch;
        // ReadByte returns -1 at end of stream; 0 is an ordinary byte value and must not end the line.
        while ((ch = affix.ReadByte()) >= 0)
        {
            if (ch == '\n')
            {
                break;
            }
            if (ch != '\r')
            {
                encoding.Append((char)ch);
            }
        }

        string currentLine = encoding.ToString();
        if (currentLine.Length == 0 || currentLine[0] == '#' || currentLine.Trim().Length == 0)
        {
            // nothing usable on this line; give up once the stream is exhausted
            if (ch < 0)
            {
                throw new Exception("Unexpected end of affix file." /*, 0*/);
            }
            continue;
        }

        Match matcher = ENCODING_PATTERN.Match(currentLine);
        if (matcher.Success)
        {
            // everything after "SET" and its whitespace is the encoding name
            int last = matcher.Index + matcher.Length;
            return currentLine.Substring(last).Trim();
        }
    }
}
/// <summary>
/// Read-only table of alternative charset names and the names they stand for.
/// </summary>
internal static readonly IDictionary<string, string> CHARSET_ALIASES = LoadCharsetAliases();
// LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
private static IDictionary<string, string> LoadCharsetAliases()
{
    IDictionary<string, string> aliasTable = new Dictionary<string, string>();
    aliasTable.Add("microsoft-cp1251", "windows-1251");
    aliasTable.Add("TIS620-2533", "TIS-620");
    return Collections.UnmodifiableMap(aliasTable);
}
/// <summary>
/// Maps an encoding name taken from an affix file to an <see cref="Encoding"/>. A missing name
/// yields UTF-8 and ISO8859-14 is served by a dedicated implementation. Other names are first
/// normalized ("ISO" without a dash, cp125x code pages, TIS-620) and then resolved through
/// <see cref="Encoding.GetEncoding(string)"/>.
/// Note, this isn't perfect as ISCII-DEVANAGARI and similar names may also be allowed.
/// </summary>
/// <param name="encoding"> Encoding name to resolve </param>
/// <returns> <see cref="Encoding"/> for the given name </returns>
// LUCENENET NOTE: This was getJavaEncoding in the original
private Encoding GetSystemEncoding(string encoding)
{
    if (string.IsNullOrEmpty(encoding))
    {
        return Encoding.UTF8;
    }
    if ("ISO8859-14".Equals(encoding, StringComparison.OrdinalIgnoreCase))
    {
        return new ISO8859_14Encoding();
    }

    string name = encoding;
    // .NET doesn't recognize the encoding without a dash between ISO and the number
    // https://msdn.microsoft.com/en-us/library/system.text.encodinginfo.getencoding(v=vs.110).aspx
    bool isoWithoutDash = name.Length > 3
        && name.StartsWith("ISO", StringComparison.OrdinalIgnoreCase)
        && name[3] != '-';
    if (isoWithoutDash)
    {
        name = "iso-" + name.Substring(3);
    }
    else
    {
        Match codePage = windowsCodePagePattern.Match(name);
        if (codePage.Success)
        {
            // code pages 1250-1258 are registered as windows-125x
            name = "windows-" + codePage.Groups[1].Value;
        }
        else if (thaiCodePagePattern.IsMatch(name))
        {
            // Thai is registered as windows-874
            name = "windows-874";
        }
    }

    return Encoding.GetEncoding(name);
}
private static Regex windowsCodePagePattern = new Regex("^(?:microsoft-)?cp-?(125[0-8])$", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
private static Regex thaiCodePagePattern = new Regex("^tis-?620(?:-?2533)?$", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
/// <summary>
/// Chooses the <see cref="FlagParsingStrategy"/> named by a FLAG line of the affix file.
/// </summary>
/// <param name="flagLine"> The complete FLAG line, i.e. the keyword followed by the flag type </param>
/// <returns> The strategy that parses flags the way the declared flag type requires </returns>
/// <exception cref="ArgumentException"> Thrown when the line does not have exactly two elements or names an unknown flag type </exception>
internal static FlagParsingStrategy GetFlagParsingStrategy(string flagLine)
{
    string[] tokens = whitespacePattern.Split(flagLine).TrimEnd();
    if (tokens.Length != 2)
    {
        throw new System.ArgumentException("Illegal FLAG specification: " + flagLine);
    }

    string flagType = tokens[1];
    switch (flagType)
    {
        case NUM_FLAG_TYPE:
            return new NumFlagParsingStrategy();
        case UTF8_FLAG_TYPE:
            return new SimpleFlagParsingStrategy();
        case LONG_FLAG_TYPE:
            return new DoubleASCIIFlagParsingStrategy();
        default:
            throw new System.ArgumentException("Unknown flag type: " + flagType);
    }
}
internal readonly char FLAG_SEPARATOR = (char)0x1f; // flag separator after escaping
/// <summary>
/// Rewrites a raw dictionary line: a backslash followed by another char yields that char
/// literally, every remaining <c>/</c> becomes <see cref="FLAG_SEPARATOR"/>, and all other
/// chars (including a trailing lone backslash) are copied unchanged.
/// </summary>
internal virtual string UnescapeEntry(string entry)
{
    var unescaped = new StringBuilder();
    int pos = 0;
    while (pos < entry.Length)
    {
        char current = entry[pos++];
        bool escapesNext = current == '\\' && pos < entry.Length;
        if (escapesNext)
        {
            // keep the escaped char as-is and step over it
            unescaped.Append(entry[pos++]);
        }
        else
        {
            unescaped.Append(current == '/' ? FLAG_SEPARATOR : current);
        }
    }
    return unescaped.ToString();
}
/// <summary>
/// Reads the dictionary file through the provided <see cref="Stream"/>s, building up the words map.
/// The entries of all streams are unescaped (and cleaned when input cleaning is enabled), written to
/// a temporary file, sorted offline, and then added to <paramref name="words"/> in sorted order, with
/// the flag-set ordinals of identical entries merged into one output. Nothing is added when the
/// streams contain no entries. Both temporary files are removed even when reading fails.
/// </summary>
/// <param name="dictionaries"> <see cref="Stream"/>s to read the dictionary file through </param>
/// <param name="decoder"> <see cref="Encoding"/> used to decode the contents of the file </param>
/// <param name="words"> FST builder that receives each distinct entry with its flag-set ordinals </param>
/// <exception cref="IOException"> Can be thrown while reading from the file </exception>
/// <exception cref="ArgumentException"> Thrown when the sorted entries are found to be out of order </exception>
private void ReadDictionaryFiles(IList<Stream> dictionaries, Encoding decoder, Builder<Int32sRef> words)
{
    BytesRef flagsScratch = new BytesRef();
    Int32sRef scratchInts = new Int32sRef();
    StringBuilder sb = new StringBuilder();

    FileInfo unsorted = FileSupport.CreateTempFile("unsorted", "dat", tempDir);
    FileInfo sorted = null;
    try
    {
        using (OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsorted))
        {
            foreach (Stream dictionary in dictionaries)
            {
                // not disposed on purpose: the caller owns the dictionary streams
                var lines = new StreamReader(dictionary, decoder);
                string line = lines.ReadLine(); // first line is number of entries (approximately, sometimes)
                while ((line = lines.ReadLine()) != null)
                {
                    line = UnescapeEntry(line);
                    if (needsInputCleaning)
                    {
                        int flagSep = line.LastIndexOf(FLAG_SEPARATOR);
                        if (flagSep == -1)
                        {
                            string cleansed = CleanInput(line, sb);
                            writer.Write(cleansed.ToString().GetBytes(Encoding.UTF8));
                        }
                        else
                        {
                            // clean only the word; the separator and flags are appended untouched
                            string text = line.Substring(0, flagSep - 0);
                            string cleansed = CleanInput(text, sb);
                            if (cleansed != sb.ToString())
                            {
                                sb.Length = 0;
                                sb.Append(cleansed);
                            }
                            sb.Append(line.Substring(flagSep));
                            writer.Write(sb.ToString().GetBytes(Encoding.UTF8));
                        }
                    }
                    else
                    {
                        writer.Write(line.GetBytes(Encoding.UTF8));
                    }
                }
            }
        }

        sorted = FileSupport.CreateTempFile("sorted", "dat", tempDir);
        OfflineSorter sorter = new OfflineSorter(new ComparerAnonymousInnerClassHelper(this));
        sorter.Sort(unsorted, sorted);
        // the unsorted copy is no longer needed; drop it before reading the sorted one
        TryDeleteTempFile(unsorted);

        using (OfflineSorter.ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(sorted))
        {
            BytesRef scratchLine = new BytesRef();
            // TODO: the flags themselves can be double-chars (long) or also numeric
            // either way the trick is to encode them as char... but they must be parsed differently
            string currentEntry = null;
            Int32sRef currentOrds = new Int32sRef();
            string line2;
            while (reader.Read(scratchLine))
            {
                line2 = scratchLine.Utf8ToString();
                string entry;
                char[] wordForm;
                int flagSep = line2.LastIndexOf(FLAG_SEPARATOR);
                if (flagSep == -1)
                {
                    wordForm = NOFLAGS;
                    entry = line2;
                }
                else
                {
                    // note, there can be comments (morph description) after a flag.
                    // we should really look for any whitespace: currently just tab and space
                    int end = line2.IndexOf('\t', flagSep);
                    if (end == -1)
                    {
                        end = line2.Length;
                    }
                    int end2 = line2.IndexOf(' ', flagSep);
                    if (end2 == -1)
                    {
                        end2 = line2.Length;
                    }
                    end = Math.Min(end, end2);
                    string flagPart = line2.Substring(flagSep + 1, end - (flagSep + 1));
                    // when aliases are defined, the flag part is a 1-based alias number
                    if (aliasCount > 0)
                    {
                        flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture));
                    }
                    wordForm = flagParsingStrategy.ParseFlags(flagPart);
                    Array.Sort(wordForm);
                    entry = line2.Substring(0, flagSep - 0);
                }

                // LUCENENET NOTE: CompareToOrdinal is an extension method that works similarly to
                // Java's String.compareTo method.
                int cmp = currentEntry == null ? 1 : entry.CompareToOrdinal(currentEntry);
                if (cmp < 0)
                {
                    throw new System.ArgumentException("out of order: " + entry + " < " + currentEntry);
                }
                else
                {
                    EncodeFlags(flagsScratch, wordForm);
                    int ord = flagLookup.Add(flagsScratch);
                    if (ord < 0)
                    {
                        // already exists in our hash
                        ord = (-ord) - 1;
                    }
                    // finalize current entry, and switch "current" if necessary
                    if (cmp > 0 && currentEntry != null)
                    {
                        Lucene.Net.Util.Fst.Util.ToUTF32(currentEntry, scratchInts);
                        words.Add(scratchInts, currentOrds);
                    }
                    // swap current
                    if (cmp > 0 || currentEntry == null)
                    {
                        currentEntry = entry;
                        currentOrds = new Int32sRef(); // must be this way
                    }
                    currentOrds.Grow(currentOrds.Length + 1);
                    currentOrds.Int32s[currentOrds.Length++] = ord;
                }
            }

            // finalize last entry; there is none when the dictionaries held no entries at all
            if (currentEntry != null)
            {
                Lucene.Net.Util.Fst.Util.ToUTF32(currentEntry, scratchInts);
                words.Add(scratchInts, currentOrds);
            }
        }
    }
    finally
    {
        // runs on failure as well; FileInfo.Delete does nothing for a file that is already gone
        TryDeleteTempFile(unsorted);
        TryDeleteTempFile(sorted);
    }
}
/// <summary>
/// Best-effort removal of a temporary file; a <c>null</c> argument and any failure to delete are ignored.
/// </summary>
private static void TryDeleteTempFile(FileInfo file)
{
    if (file == null)
    {
        return;
    }
    try
    {
        file.Delete();
    }
    catch
    {
        // ignore
    }
}
/// <summary>
/// Orders dictionary rows by the bytes that precede the last flag separator (the whole row when
/// there is none); rows whose leading parts are equal are ordered by the complete row.
/// </summary>
private class ComparerAnonymousInnerClassHelper : IComparer<BytesRef>
{
    private readonly Dictionary outerInstance;
    internal BytesRef scratch1;
    internal BytesRef scratch2;

    public ComparerAnonymousInnerClassHelper(Dictionary outerInstance)
    {
        this.outerInstance = outerInstance;
        scratch1 = new BytesRef();
        scratch2 = new BytesRef();
    }

    public virtual int Compare(BytesRef o1, BytesRef o2)
    {
        PointAtWord(scratch1, o1);
        PointAtWord(scratch2, o2);

        int wordOrder = scratch1.CompareTo(scratch2);
        // tie break on whole row
        return wordOrder != 0 ? wordOrder : o1.CompareTo(o2);
    }

    /// <summary>
    /// Makes <paramref name="view"/> reference the bytes of <paramref name="row"/>, shortened to
    /// end just before the last flag separator when one is present.
    /// </summary>
    private void PointAtWord(BytesRef view, BytesRef row)
    {
        view.Bytes = row.Bytes;
        view.Offset = row.Offset;
        view.Length = row.Length;

        int idx = view.Length - 1;
        while (idx >= 0)
        {
            if (view.Bytes[view.Offset + idx] == outerInstance.FLAG_SEPARATOR)
            {
                view.Length = idx;
                break;
            }
            idx--;
        }
    }
}
/// <summary>
/// Turns the bytes of <paramref name="b"/> back into flags: each pair of bytes forms one
/// <see cref="char"/>, high byte first. An empty input yields the shared empty array.
/// </summary>
internal static char[] DecodeFlags(BytesRef b)
{
    if (b.Length == 0)
    {
        return CharsRef.EMPTY_CHARS;
    }

    char[] decoded = new char[(int)((uint)b.Length >> 1)];
    int limit = b.Offset + b.Length;
    int pos = b.Offset;
    int next = 0;
    while (pos < limit)
    {
        int high = b.Bytes[pos] << 8;
        int low = b.Bytes[pos + 1] & 0xff;
        decoded[next++] = (char)(high | low);
        pos += 2;
    }
    return decoded;
}
/// <summary>
/// Writes <paramref name="flags"/> into <paramref name="b"/> as two bytes per flag, high byte
/// first, starting at the offset of <paramref name="b"/>, and sets its length to the number of
/// bytes written.
/// </summary>
internal static void EncodeFlags(BytesRef b, char[] flags)
{
    int byteCount = flags.Length << 1;
    b.Grow(byteCount);
    b.Length = byteCount;

    int writePos = b.Offset;
    foreach (char flag in flags)
    {
        b.Bytes[writePos++] = (byte)((flag >> 8) & 0xff);
        b.Bytes[writePos++] = (byte)(flag & 0xff);
    }
}
/// <summary>
/// Handles one AF line. The first AF line gives the number of aliases and allocates the table;
/// each later line stores its value (or the empty string when it has none) in the next free slot.
/// </summary>
private void ParseAlias(string line)
{
    string[] tokens = whitespacePattern.Split(line).TrimEnd();

    if (aliases != null)
    {
        // an alias can map to no flags
        string value = tokens.Length == 1 ? "" : tokens[1];
        aliases[aliasCount++] = value;
        return;
    }

    // first line should be the aliases count
    int declared = int.Parse(tokens[1], CultureInfo.InvariantCulture);
    aliases = new string[declared];
}
/// <summary>
/// Returns the alias stored under the 1-based number <paramref name="id"/>.
/// </summary>
/// <exception cref="ArgumentException"> Thrown when <paramref name="id"/> lies outside the alias table </exception>
private string GetAliasValue(int id)
{
    int slot = id - 1;
    try
    {
        return aliases[slot];
    }
    catch (System.IndexOutOfRangeException outOfRange)
    {
        throw new System.ArgumentException("Bad flag alias number:" + id, outOfRange);
    }
}
/// <summary>
/// Base class for the different ways flags can be written in the affix and dic files.
/// </summary>
internal abstract class FlagParsingStrategy
{
    /// <summary>
    /// Parses <paramref name="rawFlag"/>, which must contain exactly one flag.
    /// </summary>
    /// <param name="rawFlag"> Text holding a single flag </param>
    /// <returns> The parsed flag </returns>
    /// <exception cref="ArgumentException"> Thrown when the text does not parse to exactly one flag </exception>
    internal virtual char ParseFlag(string rawFlag)
    {
        char[] parsed = ParseFlags(rawFlag);
        if (parsed.Length == 1)
        {
            return parsed[0];
        }
        throw new System.ArgumentException("expected only one flag, got: " + rawFlag);
    }

    /// <summary>
    /// Parses <paramref name="rawFlags"/> into the flags it contains.
    /// </summary>
    /// <param name="rawFlags"> Text holding any number of flags </param>
    /// <returns> The parsed flags </returns>
    internal abstract char[] ParseFlags(string rawFlags);
}
/// <summary>
/// <see cref="FlagParsingStrategy"/> in which every char of the raw <see cref="string"/> is one flag.
/// Can be used with both the ASCII and UTF-8 flag types.
/// </summary>
private class SimpleFlagParsingStrategy : FlagParsingStrategy
{
    /// <summary>
    /// Returns the chars of <paramref name="rawFlags"/>, one flag per char.
    /// </summary>
    internal override char[] ParseFlags(string rawFlags)
    {
        char[] flags = rawFlags.ToCharArray();
        return flags;
    }
}
/// <summary>
/// <see cref="FlagParsingStrategy"/> for flags written as decimal numbers. Multiple flags are
/// separated by commas.
/// </summary>
private class NumFlagParsingStrategy : FlagParsingStrategy
{
    /// <summary>
    /// Splits <paramref name="rawFlags"/> on commas, strips every non-digit character from each
    /// part, and converts each remaining non-empty number into a flag.
    /// </summary>
    /// <param name="rawFlags"> Comma-separated numeric flags </param>
    /// <returns> One flag per non-empty numeric part </returns>
    internal override char[] ParseFlags(string rawFlags)
    {
        string[] parts = rawFlags.Trim().Split(',').TrimEnd();
        char[] parsed = new char[parts.Length];
        int count = 0;
        foreach (string part in parts)
        {
            // note, removing the trailing X/leading I for nepali... what is the rule here?!
            string digits = Regex.Replace(part, "[^0-9]", "");
            // note, empty flags are ignored (this happens in danish, for example)
            if (digits.Length != 0)
            {
                parsed[count++] = (char)int.Parse(digits, CultureInfo.InvariantCulture);
            }
        }
        // shrink when some parts were skipped
        return count < parsed.Length ? Arrays.CopyOf(parsed, count) : parsed;
    }
}
/// <summary>
/// Implementation of <see cref="FlagParsingStrategy"/> that assumes each flag is encoded as two ASCII characters whose codes
/// must be combined into a single character.
/// <para/>
/// The first character of a pair becomes the high byte of the flag and the second character the low byte,
/// so two different pairs (for example "AB" and "BA") never produce the same flag.
///
/// TODO (rmuir) test
/// </summary>
private class DoubleASCIIFlagParsingStrategy : FlagParsingStrategy
{
    /// <summary>
    /// Parses <paramref name="rawFlags"/> two characters at a time, producing one flag per pair.
    /// </summary>
    /// <param name="rawFlags"> Flags written as consecutive two-character pairs </param>
    /// <returns> One flag per pair; an empty array for an empty input </returns>
    /// <exception cref="System.ArgumentException">
    /// If <paramref name="rawFlags"/> has an odd number of characters, or contains a character
    /// above 0xFF that cannot be packed into a single byte
    /// </exception>
    internal override char[] ParseFlags(string rawFlags)
    {
        if (rawFlags.Length == 0)
        {
            return new char[0];
        }
        if (rawFlags.Length % 2 == 1)
        {
            throw new System.ArgumentException("Invalid flags (should be even number of characters): " + rawFlags);
        }
        char[] flags = new char[rawFlags.Length / 2];
        for (int i = 0; i < flags.Length; i++)
        {
            char first = rawFlags[2 * i];
            char second = rawFlags[2 * i + 1];
            // each half must fit in one byte, otherwise the packed value would be ambiguous
            if (first > 0xff || second > 0xff)
            {
                throw new System.ArgumentException("Invalid flags (LONG flags must be double ASCII): " + rawFlags);
            }
            // pack instead of summing: a sum maps distinct pairs such as "AB"/"BA" to the same flag
            flags[i] = (char)((first << 8) | second);
        }
        return flags;
    }
}
/// <summary>
/// Tests whether <paramref name="flag"/> occurs in <paramref name="flags"/> using a binary search.
/// </summary>
/// <param name="flags"> Flags to search; binary search requires them to be sorted ascending </param>
/// <param name="flag"> Flag to look for </param>
/// <returns> <c>true</c> if the search finds <paramref name="flag"/>, otherwise <c>false</c> </returns>
internal static bool HasFlag(char[] flags, char flag)
{
    int position = Array.BinarySearch(flags, flag);
    bool found = position >= 0;
    return found;
}
/// <summary>
/// Normalizes <paramref name="input"/>: drops every char found in the ignore table, applies the
/// input conversion mappings when they exist, and lowercases (invariant culture) when case is ignored.
/// </summary>
/// <param name="input"> Text to clean </param>
/// <param name="reuse"> Scratch builder; it is cleared first and holds the cleaned text afterwards </param>
/// <returns> The cleaned text </returns>
internal virtual string CleanInput(string input, StringBuilder reuse)
{
    reuse.Length = 0;
    // without input conversion mappings, lowercasing can happen while copying
    bool lowerWhileCopying = ignoreCase && iconv == null;
    foreach (char original in input)
    {
        bool ignored = ignore != null && Array.BinarySearch(ignore, original) >= 0;
        if (ignored)
        {
            continue;
        }
        reuse.Append(lowerWhileCopying ? char.ToLowerInvariant(original) : original);
    }
    if (iconv == null)
    {
        return reuse.ToString();
    }
    try
    {
        ApplyMappings(iconv, reuse);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.Message, bogus);
    }
    if (ignoreCase)
    {
        // lowercase only after the mappings have been applied
        int position = 0;
        while (position < reuse.Length)
        {
            reuse[position] = char.ToLowerInvariant(reuse[position]);
            position++;
        }
    }
    return reuse.ToString();
}
// TODO: this could be more efficient!
/// <summary>
/// Rewrites <paramref name="sb"/> in place. At each position the longest run of chars that
/// <paramref name="fst"/> accepts is replaced by the output collected for it; scanning then
/// resumes after the inserted replacement.
/// </summary>
/// <param name="fst"> Mappings from input char sequences to replacement output </param>
/// <param name="sb"> Text to rewrite in place </param>
internal static void ApplyMappings(FST<CharsRef> fst, StringBuilder sb)
{
    FST.BytesReader reader = fst.GetBytesReader();
    FST.Arc<CharsRef> rootArc = fst.GetFirstArc(new FST.Arc<CharsRef>());
    CharsRef emptyOutput = fst.Outputs.NoOutput;
    // scratch arc, reset to the root for every start position
    FST.Arc<CharsRef> cursor = new FST.Arc<CharsRef>();
    int start = 0;
    while (start < sb.Length)
    {
        cursor.CopyFrom(rootArc);
        CharsRef collected = emptyOutput;
        int matchEnd = -1;
        CharsRef matchOutput = null;
        int probe = start;
        // follow arcs for as long as the FST accepts the next char
        while (probe < sb.Length && fst.FindTargetArc(sb[probe], cursor, cursor, reader) != null)
        {
            collected = fst.Outputs.Add(collected, cursor.Output);
            if (cursor.IsFinal)
            {
                // remember the longest complete match seen so far
                matchOutput = fst.Outputs.Add(collected, cursor.NextFinalOutput);
                matchEnd = probe;
            }
            probe++;
        }
        if (matchEnd >= 0)
        {
            sb.Remove(start, matchEnd + 1 - start);
            sb.Insert(start, matchOutput);
            // land on the last inserted char; the increment below steps past it
            start += (matchOutput.Length - 1);
        }
        start++;
    }
}
}
}