src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs - lucenenet - Git at Google

 // commons-codec version compatibility level: 1.10
 using Lucene.Net.Support;
 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Reflection;
 using System.Text;
 using System.Text.RegularExpressions;

 namespace Lucene.Net.Analysis.Phonetic.Language
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Encodes a string into a Daitch-Mokotoff Soundex value.
     /// </summary>
     /// <remarks>
     /// The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
     /// accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
     /// <para/>
     /// The main differences compared to the other soundex variants are:
     /// <list type="bullet">
     ///     <item><description>coded names are 6 digits long</description></item>
     ///     <item><description>the initial character of the name is coded</description></item>
     ///     <item><description>rules to encoded multi-character n-grams</description></item>
     ///     <item><description>multiple possible encodings for the same name (branching)</description></item>
     /// </list>
     /// <para/>
     /// This implementation supports branching, depending on the used method:
     /// <list type="bullet">
     ///     <item><term><see cref="Encode(string)"/></term><description>branching disabled, only the first code will be returned</description></item>
     ///     <item><term><see cref="GetSoundex(string)"/></term><description>branching enabled, all codes will be returned, separated by '|'</description></item>
     /// </list>
     /// <para/>
     /// Note: this implementation has additional branching rules compared to the original description of the algorithm. The
     /// rules can be customized by overriding the default rules contained in the resource file
     /// <c>Lucene.Net.Analysis.Phonetic.Language.dmrules.txt</c>.
     /// <para/>
     /// This class is thread-safe.
     /// <para/>
     /// See: <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
     /// <para/>
     /// See: <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
     /// <para/>
     /// since 1.10
     /// </remarks>
     /// <seealso cref="Soundex"/>
     public class DaitchMokotoffSoundex : IStringEncoder
     {
         /// <summary>
         /// Inner class representing a branch during DM soundex encoding.
         /// </summary>
         private sealed class Branch
         {
             private readonly StringBuilder builder;
             private string cachedString;
             private string lastReplacement;

             internal Branch()
             {
                 builder = new StringBuilder();
                 lastReplacement = null;
                 cachedString = null;
             }

             /// <summary>
             /// Creates a new branch, identical to this branch.
             /// </summary>
             /// <returns>A new, identical branch.</returns>
             public Branch CreateBranch()
             {
                 Branch branch = new Branch();
                 branch.builder.Append(ToString());
                 branch.lastReplacement = this.lastReplacement;
                 return branch;
             }

             public override bool Equals(object other)
             {
                 if (this == other)
                 {
                     return true;
                 }
                 if (!(other is Branch))
                 {
                     return false;
                 }

                 return ToString().Equals(((Branch)other).ToString(), StringComparison.Ordinal);
             }

             /// <summary>
             /// Finish this branch by appending '0's until the maximum code length has been reached.
             /// </summary>
             public void Finish()
             {
                 while (builder.Length < MAX_LENGTH)
                 {
                     builder.Append('0');
                     cachedString = null;
                 }
             }

             public override int GetHashCode()
             {
                 return ToString().GetHashCode();
             }

             /// <summary>
             /// Process the next replacement to be added to this branch.
             /// </summary>
             /// <param name="replacement">The next replacement to append.</param>
             /// <param name="forceAppend">Indicates if the default processing shall be overridden.</param>
             public void ProcessNextReplacement(string replacement, bool forceAppend)
             {
                 bool append = lastReplacement == null || !lastReplacement.EndsWith(replacement, StringComparison.Ordinal) || forceAppend;

                 if (append && builder.Length < MAX_LENGTH)
                 {
                     builder.Append(replacement);
                     // remove all characters after the maximum length
                     if (builder.Length > MAX_LENGTH)
                     {
                         //builder.delete(MAX_LENGTH, builder.Length);
                         builder.Remove(MAX_LENGTH, builder.Length - MAX_LENGTH);
                     }
                     cachedString = null;
                 }

                 lastReplacement = replacement;
             }

             public override string ToString()
             {
                 if (cachedString == null)
                 {
                     cachedString = builder.ToString();
                 }
                 return cachedString;
             }
         }

         /// <summary>
         /// Inner class for storing rules.
         /// </summary>
         private sealed class Rule
         {
             private readonly static Regex PIPE = new Regex(@"\|", RegexOptions.Compiled);

             private readonly string pattern;
             private readonly string[] replacementAtStart;
             private readonly string[] replacementBeforeVowel;
             private readonly string[] replacementDefault;

             internal Rule(string pattern, string replacementAtStart, string replacementBeforeVowel,
                     string replacementDefault)
             {
                 this.pattern = pattern;
                 this.replacementAtStart = PIPE.Split(replacementAtStart);
                 this.replacementBeforeVowel = PIPE.Split(replacementBeforeVowel);
                 this.replacementDefault = PIPE.Split(replacementDefault);
             }

             // LUCENENET specific - need read access to pattern
             public string Pattern
             {
                 get { return pattern; }
             }

             public int PatternLength
             {
                 get { return pattern.Length; }
             }

             public string[] GetReplacements(string context, bool atStart)
             {
                 if (atStart)
                 {
                     return replacementAtStart;
                 }

                 int nextIndex = PatternLength;
                 bool nextCharIsVowel = nextIndex < context.Length ? IsVowel(context[nextIndex]) : false;
                 if (nextCharIsVowel)
                 {
                     return replacementBeforeVowel;
                 }

                 return replacementDefault;
             }

             private bool IsVowel(char ch)
             {
                 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
             }

             public bool Matches(string context)
             {
                 return context.StartsWith(pattern, StringComparison.Ordinal);
             }

             public override string ToString()
             {
                 return string.Format("{0}=({1},{2},{3})", pattern, Collections.ToString(replacementAtStart),
                     Collections.ToString(replacementBeforeVowel), Collections.ToString(replacementDefault));
             }
         }

         private static readonly string COMMENT = "//";
         private static readonly string DOUBLE_QUOTE = "\"";

         private static readonly string MULTILINE_COMMENT_END = "*/";

         private static readonly string MULTILINE_COMMENT_START = "/*";

         /// <summary>The resource file containing the replacement and folding rules</summary>
         private static readonly string RESOURCE_FILE = "dmrules.txt";

         /// <summary>The code length of a DM soundex value.</summary>
         private static readonly int MAX_LENGTH = 6;

         /// <summary>Transformation rules indexed by the first character of their pattern.</summary>
         private static readonly IDictionary<char, IList<Rule>> RULES = new Dictionary<char, IList<Rule>>();

         /// <summary>Folding rules.</summary>
         private static readonly IDictionary<char, char> FOLDINGS = new Dictionary<char, char>();

         private static readonly Regex WHITESPACE = new Regex(@"\s+", RegexOptions.Compiled);

         private class DaitchMokotoffRuleComparer : IComparer<Rule>
         {
             public int Compare(Rule rule1, Rule rule2)
             {
                 return rule2.PatternLength - rule1.PatternLength;
             }
         }

         static DaitchMokotoffSoundex()
         {
             Stream rulesIS = typeof(DaitchMokotoffSoundex).GetTypeInfo().Assembly.FindAndGetManifestResourceStream(typeof(DaitchMokotoffSoundex), RESOURCE_FILE);
             if (rulesIS == null)
             {
                 throw new ArgumentException("Unable to load resource: " + RESOURCE_FILE);
             }

             using (TextReader scanner = new StreamReader(rulesIS, Encoding.UTF8))
             {
                 ParseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
             }

             // sort RULES by pattern length in descending order
             foreach (var rule in RULES)
             {
                 IList<Rule> ruleList = rule.Value;
                 ruleList.Sort(new DaitchMokotoffRuleComparer());
             }
         }

         private static void ParseRules(TextReader scanner, string location,
             IDictionary<char, IList<Rule>> ruleMapping, IDictionary<char, char> asciiFoldings)
         {
             int currentLine = 0;
             bool inMultilineComment = false;

             string rawLine;
             while ((rawLine = scanner.ReadLine()) != null)
             {
                 currentLine++;
                 string line = rawLine;

                 if (inMultilineComment)
                 {
                     if (line.EndsWith(MULTILINE_COMMENT_END, StringComparison.Ordinal))
                     {
                         inMultilineComment = false;
                     }
                     continue;
                 }

                 if (line.StartsWith(MULTILINE_COMMENT_START, StringComparison.Ordinal))
                 {
                     inMultilineComment = true;
                 }
                 else
                 {
                     // discard comments
                     int cmtI = line.IndexOf(COMMENT, StringComparison.Ordinal);
                     if (cmtI >= 0)
                     {
                         line = line.Substring(0, cmtI - 0);
                     }

                     // trim leading-trailing whitespace
                     line = line.Trim();

                     if (line.Length == 0)
                     {
                         continue; // empty lines can be safely skipped
                     }

                     if (line.Contains("="))
                     {
                         // folding
                         string[] parts = line.Split('=').TrimEnd();
                         if (parts.Length != 2)
                         {
                             throw new ArgumentException("Malformed folding statement split into " + parts.Length +
                                     " parts: " + rawLine + " in " + location);
                         }
                         else
                         {
                             string leftCharacter = parts[0];
                             string rightCharacter = parts[1];

                             if (leftCharacter.Length != 1 || rightCharacter.Length != 1)
                             {
                                 throw new ArgumentException("Malformed folding statement - " +
                                         "patterns are not single characters: " + rawLine + " in " + location);
                             }

                             asciiFoldings[leftCharacter[0]] = rightCharacter[0];
                         }
                     }
                     else
                     {
                         // rule
                         string[] parts = WHITESPACE.Split(line).TrimEnd();
                         if (parts.Length != 4)
                         {
                             throw new ArgumentException("Malformed rule statement split into " + parts.Length +
                                     " parts: " + rawLine + " in " + location);
                         }
                         else
                         {
                             try
                             {
                                 string pattern = StripQuotes(parts[0]);
                                 string replacement1 = StripQuotes(parts[1]);
                                 string replacement2 = StripQuotes(parts[2]);
                                 string replacement3 = StripQuotes(parts[3]);

                                 Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
                                 char patternKey = r.Pattern[0];
                                 IList<Rule> rules;
                                 if (!ruleMapping.TryGetValue(patternKey, out rules) || rules == null)
                                 {
                                     rules = new List<Rule>();
                                     ruleMapping[patternKey] = rules;
                                 }
                                 rules.Add(r);
                             }
                             catch (ArgumentException e)
                             {
                                 throw new InvalidOperationException(
                                         "Problem parsing line '" + currentLine + "' in " + location, e);
                             }
                         }
                     }
                 }
             }
         }

         private static string StripQuotes(string str)
         {
             if (str.StartsWith(DOUBLE_QUOTE, StringComparison.Ordinal))
             {
                 str = str.Substring(1);
             }

             if (str.EndsWith(DOUBLE_QUOTE, StringComparison.Ordinal))
             {
                 str = str.Substring(0, str.Length - 1);
             }

             return str;
         }

         /// <summary>Whether to use ASCII folding prior to encoding.</summary>
         private readonly bool folding;

         /// <summary>
         /// Creates a new instance with ASCII-folding enabled.
         /// </summary>
         public DaitchMokotoffSoundex()
             : this(true)
         {
         }

         /// <summary>
         /// Creates a new instance.
         /// <para/>
         /// With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
         /// è -&gt; e.
         /// </summary>
         /// <param name="folding">If ASCII-folding shall be performed before encoding.</param>
         public DaitchMokotoffSoundex(bool folding)
         {
             this.folding = folding;
         }

         /// <summary>
         /// Performs a cleanup of the input string before the actual soundex transformation.
         /// <para/>
         /// Removes all whitespace characters and performs ASCII folding if enabled.
         /// </summary>
         /// <param name="input">The input string to cleanup.</param>
         /// <returns>A cleaned up string.</returns>
         private string Cleanup(string input)
         {
             StringBuilder sb = new StringBuilder();
             foreach (char c in input.ToCharArray())
             {
                 char ch = c;
                 if (char.IsWhiteSpace(ch))
                 {
                     continue;
                 }

                 ch = char.ToLowerInvariant(ch);
                 if (folding && FOLDINGS.TryGetValue(ch, out char newChar))
                 {
                     ch = newChar;
                 }
                 sb.Append(ch);
             }
             return sb.ToString();
         }

         // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
         //**
         // * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
         // * <p>
         // * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
         // * EncoderException if the supplied object is not of type java.lang.String.
         // * </p>
         // *
         // * @see #soundex(String)
         // *
         // * @param obj
         // *            Object to encode
         // * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
         // *         supplied.
         // * @throws EncoderException
         // *             if the parameter supplied is not of type java.lang.String
         // * @throws IllegalArgumentException
         // *             if a character is not mapped
         // */
         //@Override
         //    public Object encode(object obj)
         //{
         //        if (!(obj instanceof String)) {
         //        throw new EncoderException(
         //                "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
         //    }
         //        return encode((String) obj);
         //}

         /// <summary>
         /// Encodes a string using the Daitch-Mokotoff soundex algorithm without branching.
         /// </summary>
         /// <param name="source">A string to encode.</param>
         /// <returns>A DM Soundex code corresponding to the string supplied.</returns>
         /// <exception cref="ArgumentException">If a character is not mapped.</exception>
         /// <seealso cref="GetSoundex(string)"/>
         public virtual string Encode(string source)
         {
             if (source == null)
             {
                 return null;
             }
             return GetSoundex(source, false)[0];
         }

         /// <summary>
         /// Encodes a string using the Daitch-Mokotoff soundex algorithm with branching.
         /// <para/>
         /// In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
         /// separated by '|'.
         /// <para/>
         /// Example: the name "AUERBACH" is encoded as both
         /// <list type="bullet">
         ///     <item><description>097400</description></item>
         ///     <item><description>097500</description></item>
         /// </list>
         /// <para/>
         /// Thus the result will be "097400|097500".
         /// </summary>
         /// <param name="source">A string to encode.</param>
         /// <returns>A string containing a set of DM Soundex codes corresponding to the string supplied.</returns>
         /// <exception cref="ArgumentException">If a character is not mapped.</exception>
         public virtual string GetSoundex(string source)
         {
             string[] branches = GetSoundex(source, true);
             StringBuilder sb = new StringBuilder();
             int index = 0;
             foreach (string branch in branches)
             {
                 sb.Append(branch);
                 if (++index < branches.Length)
                 {
                     sb.Append('|');
                 }
             }
             return sb.ToString();
         }

         /// <summary>
         /// Perform the actual DM Soundex algorithm on the input string.
         /// </summary>
         /// <param name="source">A string to encode.</param>
         /// <param name="branching">If branching shall be performed.</param>
         /// <returns>A string array containing all DM Soundex codes corresponding to the string supplied depending on the selected branching mode.</returns>
         /// <exception cref="ArgumentException">If a character is not mapped.</exception>
         private string[] GetSoundex(string source, bool branching)
         {
             if (source == null)
             {
                 return null;
             }

             string input = Cleanup(source);

             // LinkedHashSet preserves input order. In .NET we can use List for that purpose.
             IList<Branch> currentBranches = new List<Branch>();
             currentBranches.Add(new Branch());

             char lastChar = '\0';
             for (int index = 0; index < input.Length; index++)
             {
                 char ch = input[index];

                 // ignore whitespace inside a name
                 if (char.IsWhiteSpace(ch))
                 {
                     continue;
                 }

                 string inputContext = input.Substring(index);
                 IList<Rule> rules;
                 if (!RULES.TryGetValue(ch, out rules) || rules == null)
                 {
                     continue;
                 }

                 // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
                 IList<Branch> nextBranches = branching ? new List<Branch>() : Collections.EmptyList<Branch>();

                 foreach (Rule rule in rules)
                 {
                     if (rule.Matches(inputContext))
                     {
                         if (branching)
                         {
                             nextBranches.Clear();
                         }
                         string[] replacements = rule.GetReplacements(inputContext, lastChar == '\0');
                         bool branchingRequired = replacements.Length > 1 && branching;

                         foreach (Branch branch in currentBranches)
                         {
                             foreach (string nextReplacement in replacements)
                             {
                                 // if we have multiple replacements, always create a new branch
                                 Branch nextBranch = branchingRequired ? branch.CreateBranch() : branch;

                                 // special rule: occurrences of mn or nm are treated differently
                                 bool force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');

                                 nextBranch.ProcessNextReplacement(nextReplacement, force);

                                 if (branching)
                                 {
                                     if (!nextBranches.Contains(nextBranch))
                                     {
                                         nextBranches.Add(nextBranch);
                                     }
                                 }
                                 else
                                 {
                                     break;
                                 }
                             }
                         }

                         if (branching)
                         {
                             currentBranches.Clear();
                             currentBranches.AddRange(nextBranches);
                         }
                         index += rule.PatternLength - 1;
                         break;
                     }
                 }

                 lastChar = ch;
             }

             string[] result = new string[currentBranches.Count];
             int idx = 0;
             foreach (Branch branch in currentBranches)
             {
                 branch.Finish();
                 result[idx++] = branch.ToString();
             }

             return result;
         }
     }
 }
	// commons-codec version compatibility level: 1.10
	using Lucene.Net.Support;
	using System;
	using System.Collections.Generic;
	using System.IO;
	using System.Reflection;
	using System.Text;
	using System.Text.RegularExpressions;

	namespace Lucene.Net.Analysis.Phonetic.Language
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Encodes a string into a Daitch-Mokotoff Soundex value.
	/// </summary>
	/// <remarks>
	/// The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
	/// accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
	/// <para/>
	/// The main differences compared to the other soundex variants are:
	/// <list type="bullet">
	/// <item><description>coded names are 6 digits long</description></item>
	/// <item><description>the initial character of the name is coded</description></item>
	/// <item><description>rules to encoded multi-character n-grams</description></item>
	/// <item><description>multiple possible encodings for the same name (branching)</description></item>
	/// </list>
	/// <para/>
	/// This implementation supports branching, depending on the used method:
	/// <list type="bullet">
	/// <item><term><see cref="Encode(string)"/></term><description>branching disabled, only the first code will be returned</description></item>
	/// <item><term><see cref="GetSoundex(string)"/></term><description>branching enabled, all codes will be returned, separated by '\|'</description></item>
	/// </list>
	/// <para/>
	/// Note: this implementation has additional branching rules compared to the original description of the algorithm. The
	/// rules can be customized by overriding the default rules contained in the resource file
	/// <c>Lucene.Net.Analysis.Phonetic.Language.dmrules.txt</c>.
	/// <para/>
	/// This class is thread-safe.
	/// <para/>
	/// See: <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
	/// <para/>
	/// See: <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
	/// <para/>
	/// since 1.10
	/// </remarks>
	/// <seealso cref="Soundex"/>
	public class DaitchMokotoffSoundex : IStringEncoder
	{
	/// <summary>
	/// Inner class representing a branch during DM soundex encoding.
	/// </summary>
	private sealed class Branch
	{
	private readonly StringBuilder builder;
	private string cachedString;
	private string lastReplacement;

	internal Branch()
	{
	builder = new StringBuilder();
	lastReplacement = null;
	cachedString = null;
	}

	/// <summary>
	/// Creates a new branch, identical to this branch.
	/// </summary>
	/// <returns>A new, identical branch.</returns>
	public Branch CreateBranch()
	{
	Branch branch = new Branch();
	branch.builder.Append(ToString());
	branch.lastReplacement = this.lastReplacement;
	return branch;
	}

	public override bool Equals(object other)
	{
	if (this == other)
	{
	return true;
	}
	if (!(other is Branch))
	{
	return false;
	}

	return ToString().Equals(((Branch)other).ToString(), StringComparison.Ordinal);
	}

	/// <summary>
	/// Finish this branch by appending '0's until the maximum code length has been reached.
	/// </summary>
	public void Finish()
	{
	while (builder.Length < MAX_LENGTH)
	{
	builder.Append('0');
	cachedString = null;
	}
	}

	public override int GetHashCode()
	{
	return ToString().GetHashCode();
	}

	/// <summary>
	/// Process the next replacement to be added to this branch.
	/// </summary>
	/// <param name="replacement">The next replacement to append.</param>
	/// <param name="forceAppend">Indicates if the default processing shall be overridden.</param>
	public void ProcessNextReplacement(string replacement, bool forceAppend)
	{
	bool append = lastReplacement == null \|\| !lastReplacement.EndsWith(replacement, StringComparison.Ordinal) \|\| forceAppend;

	if (append && builder.Length < MAX_LENGTH)
	{
	builder.Append(replacement);
	// remove all characters after the maximum length
	if (builder.Length > MAX_LENGTH)
	{
	//builder.delete(MAX_LENGTH, builder.Length);
	builder.Remove(MAX_LENGTH, builder.Length - MAX_LENGTH);
	}
	cachedString = null;
	}

	lastReplacement = replacement;
	}

	public override string ToString()
	{
	if (cachedString == null)
	{
	cachedString = builder.ToString();
	}
	return cachedString;
	}
	}

	/// <summary>
	/// Inner class for storing rules.
	/// </summary>
	private sealed class Rule
	{
	private readonly static Regex PIPE = new Regex(@"\\|", RegexOptions.Compiled);

	private readonly string pattern;
	private readonly string[] replacementAtStart;
	private readonly string[] replacementBeforeVowel;
	private readonly string[] replacementDefault;

	internal Rule(string pattern, string replacementAtStart, string replacementBeforeVowel,
	string replacementDefault)
	{
	this.pattern = pattern;
	this.replacementAtStart = PIPE.Split(replacementAtStart);
	this.replacementBeforeVowel = PIPE.Split(replacementBeforeVowel);
	this.replacementDefault = PIPE.Split(replacementDefault);
	}

	// LUCENENET specific - need read access to pattern
	public string Pattern
	{
	get { return pattern; }
	}

	public int PatternLength
	{
	get { return pattern.Length; }
	}

	public string[] GetReplacements(string context, bool atStart)
	{
	if (atStart)
	{
	return replacementAtStart;
	}

	int nextIndex = PatternLength;
	bool nextCharIsVowel = nextIndex < context.Length ? IsVowel(context[nextIndex]) : false;
	if (nextCharIsVowel)
	{
	return replacementBeforeVowel;
	}

	return replacementDefault;
	}

	private bool IsVowel(char ch)
	{
	return ch == 'a' \|\| ch == 'e' \|\| ch == 'i' \|\| ch == 'o' \|\| ch == 'u';
	}

	public bool Matches(string context)
	{
	return context.StartsWith(pattern, StringComparison.Ordinal);
	}

	public override string ToString()
	{
	return string.Format("{0}=({1},{2},{3})", pattern, Collections.ToString(replacementAtStart),
	Collections.ToString(replacementBeforeVowel), Collections.ToString(replacementDefault));
	}
	}

	private static readonly string COMMENT = "//";
	private static readonly string DOUBLE_QUOTE = "\"";

	private static readonly string MULTILINE_COMMENT_END = "*/";

	private static readonly string MULTILINE_COMMENT_START = "/*";

	/// <summary>The resource file containing the replacement and folding rules</summary>
	private static readonly string RESOURCE_FILE = "dmrules.txt";

	/// <summary>The code length of a DM soundex value.</summary>
	private static readonly int MAX_LENGTH = 6;

	/// <summary>Transformation rules indexed by the first character of their pattern.</summary>
	private static readonly IDictionary<char, IList<Rule>> RULES = new Dictionary<char, IList<Rule>>();

	/// <summary>Folding rules.</summary>
	private static readonly IDictionary<char, char> FOLDINGS = new Dictionary<char, char>();

	private static readonly Regex WHITESPACE = new Regex(@"\s+", RegexOptions.Compiled);

	private class DaitchMokotoffRuleComparer : IComparer<Rule>
	{
	public int Compare(Rule rule1, Rule rule2)
	{
	return rule2.PatternLength - rule1.PatternLength;
	}
	}

	static DaitchMokotoffSoundex()
	{
	Stream rulesIS = typeof(DaitchMokotoffSoundex).GetTypeInfo().Assembly.FindAndGetManifestResourceStream(typeof(DaitchMokotoffSoundex), RESOURCE_FILE);
	if (rulesIS == null)
	{
	throw new ArgumentException("Unable to load resource: " + RESOURCE_FILE);
	}

	using (TextReader scanner = new StreamReader(rulesIS, Encoding.UTF8))
	{
	ParseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
	}

	// sort RULES by pattern length in descending order
	foreach (var rule in RULES)
	{
	IList<Rule> ruleList = rule.Value;
	ruleList.Sort(new DaitchMokotoffRuleComparer());
	}
	}

	private static void ParseRules(TextReader scanner, string location,
	IDictionary<char, IList<Rule>> ruleMapping, IDictionary<char, char> asciiFoldings)
	{
	int currentLine = 0;
	bool inMultilineComment = false;

	string rawLine;
	while ((rawLine = scanner.ReadLine()) != null)
	{
	currentLine++;
	string line = rawLine;

	if (inMultilineComment)
	{
	if (line.EndsWith(MULTILINE_COMMENT_END, StringComparison.Ordinal))
	{
	inMultilineComment = false;
	}
	continue;
	}

	if (line.StartsWith(MULTILINE_COMMENT_START, StringComparison.Ordinal))
	{
	inMultilineComment = true;
	}
	else
	{
	// discard comments
	int cmtI = line.IndexOf(COMMENT, StringComparison.Ordinal);
	if (cmtI >= 0)
	{
	line = line.Substring(0, cmtI - 0);
	}

	// trim leading-trailing whitespace
	line = line.Trim();

	if (line.Length == 0)
	{
	continue; // empty lines can be safely skipped
	}

	if (line.Contains("="))
	{
	// folding
	string[] parts = line.Split('=').TrimEnd();
	if (parts.Length != 2)
	{
	throw new ArgumentException("Malformed folding statement split into " + parts.Length +
	" parts: " + rawLine + " in " + location);
	}
	else
	{
	string leftCharacter = parts[0];
	string rightCharacter = parts[1];

	if (leftCharacter.Length != 1 \|\| rightCharacter.Length != 1)
	{
	throw new ArgumentException("Malformed folding statement - " +
	"patterns are not single characters: " + rawLine + " in " + location);
	}

	asciiFoldings[leftCharacter[0]] = rightCharacter[0];
	}
	}
	else
	{
	// rule
	string[] parts = WHITESPACE.Split(line).TrimEnd();
	if (parts.Length != 4)
	{
	throw new ArgumentException("Malformed rule statement split into " + parts.Length +
	" parts: " + rawLine + " in " + location);
	}
	else
	{
	try
	{
	string pattern = StripQuotes(parts[0]);
	string replacement1 = StripQuotes(parts[1]);
	string replacement2 = StripQuotes(parts[2]);
	string replacement3 = StripQuotes(parts[3]);

	Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
	char patternKey = r.Pattern[0];
	IList<Rule> rules;
	if (!ruleMapping.TryGetValue(patternKey, out rules) \|\| rules == null)
	{
	rules = new List<Rule>();
	ruleMapping[patternKey] = rules;
	}
	rules.Add(r);
	}
	catch (ArgumentException e)
	{
	throw new InvalidOperationException(
	"Problem parsing line '" + currentLine + "' in " + location, e);
	}
	}
	}
	}
	}
	}

	private static string StripQuotes(string str)
	{
	if (str.StartsWith(DOUBLE_QUOTE, StringComparison.Ordinal))
	{
	str = str.Substring(1);
	}

	if (str.EndsWith(DOUBLE_QUOTE, StringComparison.Ordinal))
	{
	str = str.Substring(0, str.Length - 1);
	}

	return str;
	}

	/// <summary>Whether to use ASCII folding prior to encoding.</summary>
	private readonly bool folding;

	/// <summary>
	/// Creates a new instance with ASCII-folding enabled.
	/// </summary>
	public DaitchMokotoffSoundex()
	: this(true)
	{
	}

	/// <summary>
	/// Creates a new instance.
	/// <para/>
	/// With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
	/// è -> e.
	/// </summary>
	/// <param name="folding">If ASCII-folding shall be performed before encoding.</param>
	public DaitchMokotoffSoundex(bool folding)
	{
	this.folding = folding;
	}

	/// <summary>
	/// Performs a cleanup of the input string before the actual soundex transformation.
	/// <para/>
	/// Removes all whitespace characters and performs ASCII folding if enabled.
	/// </summary>
	/// <param name="input">The input string to cleanup.</param>
	/// <returns>A cleaned up string.</returns>
	private string Cleanup(string input)
	{
	StringBuilder sb = new StringBuilder();
	foreach (char c in input.ToCharArray())
	{
	char ch = c;
	if (char.IsWhiteSpace(ch))
	{
	continue;
	}

	ch = char.ToLowerInvariant(ch);
	if (folding && FOLDINGS.TryGetValue(ch, out char newChar))
	{
	ch = newChar;
	}
	sb.Append(ch);
	}
	return sb.ToString();
	}

	// LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
	//**
	// * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
	// * <p>
	// * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
	// * EncoderException if the supplied object is not of type java.lang.String.
	// * </p>
	// *
	// * @see #soundex(String)
	// *
	// * @param obj
	// * Object to encode
	// * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
	// * supplied.
	// * @throws EncoderException
	// * if the parameter supplied is not of type java.lang.String
	// * @throws IllegalArgumentException
	// * if a character is not mapped
	// */
	//@Override
	// public Object encode(object obj)
	//{
	// if (!(obj instanceof String)) {
	// throw new EncoderException(
	// "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
	// }
	// return encode((String) obj);
	//}

	/// <summary>
	/// Encodes a string using the Daitch-Mokotoff soundex algorithm without branching.
	/// </summary>
	/// <param name="source">A string to encode.</param>
	/// <returns>A DM Soundex code corresponding to the string supplied.</returns>
	/// <exception cref="ArgumentException">If a character is not mapped.</exception>
	/// <seealso cref="GetSoundex(string)"/>
	public virtual string Encode(string source)
	{
	if (source == null)
	{
	return null;
	}
	return GetSoundex(source, false)[0];
	}

	/// <summary>
	/// Encodes a string using the Daitch-Mokotoff soundex algorithm with branching.
	/// <para/>
	/// In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
	/// separated by '\|'.
	/// <para/>
	/// Example: the name "AUERBACH" is encoded as both
	/// <list type="bullet">
	/// <item><description>097400</description></item>
	/// <item><description>097500</description></item>
	/// </list>
	/// <para/>
	/// Thus the result will be "097400\|097500".
	/// </summary>
	/// <param name="source">A string to encode.</param>
	/// <returns>A string containing a set of DM Soundex codes corresponding to the string supplied.</returns>
	/// <exception cref="ArgumentException">If a character is not mapped.</exception>
	public virtual string GetSoundex(string source)
	{
	string[] branches = GetSoundex(source, true);
	StringBuilder sb = new StringBuilder();
	int index = 0;
	foreach (string branch in branches)
	{
	sb.Append(branch);
	if (++index < branches.Length)
	{
	sb.Append('\|');
	}
	}
	return sb.ToString();
	}

	/// <summary>
	/// Perform the actual DM Soundex algorithm on the input string.
	/// </summary>
	/// <param name="source">A string to encode.</param>
	/// <param name="branching">If branching shall be performed.</param>
	/// <returns>A string array containing all DM Soundex codes corresponding to the string supplied depending on the selected branching mode.</returns>
	/// <exception cref="ArgumentException">If a character is not mapped.</exception>
	private string[] GetSoundex(string source, bool branching)
	{
	if (source == null)
	{
	return null;
	}

	string input = Cleanup(source);

	// LinkedHashSet preserves input order. In .NET we can use List for that purpose.
	IList<Branch> currentBranches = new List<Branch>();
	currentBranches.Add(new Branch());

	char lastChar = '\0';
	for (int index = 0; index < input.Length; index++)
	{
	char ch = input[index];

	// ignore whitespace inside a name
	if (char.IsWhiteSpace(ch))
	{
	continue;
	}

	string inputContext = input.Substring(index);
	IList<Rule> rules;
	if (!RULES.TryGetValue(ch, out rules) \|\| rules == null)
	{
	continue;
	}

	// use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
	IList<Branch> nextBranches = branching ? new List<Branch>() : Collections.EmptyList<Branch>();

	foreach (Rule rule in rules)
	{
	if (rule.Matches(inputContext))
	{
	if (branching)
	{
	nextBranches.Clear();
	}
	string[] replacements = rule.GetReplacements(inputContext, lastChar == '\0');
	bool branchingRequired = replacements.Length > 1 && branching;

	foreach (Branch branch in currentBranches)
	{
	foreach (string nextReplacement in replacements)
	{
	// if we have multiple replacements, always create a new branch
	Branch nextBranch = branchingRequired ? branch.CreateBranch() : branch;

	// special rule: occurrences of mn or nm are treated differently
	bool force = (lastChar == 'm' && ch == 'n') \|\| (lastChar == 'n' && ch == 'm');

	nextBranch.ProcessNextReplacement(nextReplacement, force);

	if (branching)
	{
	if (!nextBranches.Contains(nextBranch))
	{
	nextBranches.Add(nextBranch);
	}
	}
	else
	{
	break;
	}
	}
	}

	if (branching)
	{
	currentBranches.Clear();
	currentBranches.AddRange(nextBranches);
	}
	index += rule.PatternLength - 1;
	break;
	}
	}

	lastChar = ch;
	}

	string[] result = new string[currentBranches.Count];
	int idx = 0;
	foreach (Branch branch in currentBranches)
	{
	branch.Finish();
	result[idx++] = branch.ToString();
	}

	return result;
	}
	}
	}