src/Lucene.Net.Analysis.Phonetic/Language/Bm/PhoneticEngine.cs - lucenenet - Git at Google

 // commons-codec version compatibility level: 1.9
 using J2N.Collections.Generic.Extensions;
 using J2N.Text;
 using Lucene.Net.Support;
 using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
 using System.Text;
 using System.Text.RegularExpressions;
 using JCG = J2N.Collections.Generic;

 namespace Lucene.Net.Analysis.Phonetic.Language.Bm
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Converts words into potential phonetic representations.
     /// </summary>
     /// <remarks>
     /// This is a two-stage process. Firstly, the word is converted into a phonetic representation that takes
     /// into account the likely source language. Next, this phonetic representation is converted into a
     /// pan-European 'average' representation, allowing comparison between different versions of essentially
     /// the same word from different languages.
     /// <para/>
     /// This class is intentionally immutable and thread-safe.
     /// If you wish to alter the settings for a PhoneticEngine, you
     /// must make a new one with the updated settings.
     /// <para/>
     /// Ported from phoneticengine.php
     /// <para/>
     /// since 1.6
     /// </remarks>
     public class PhoneticEngine
     {
         // Matches runs of whitespace when splitting the tidied input into words. Declared static and
         // read-only so the compiled pattern is built once (not once per engine instance) and cannot be
         // reassigned, in keeping with the immutability this class documents.
         internal static readonly Regex WHITESPACE = new Regex("\\s+", RegexOptions.Compiled);

         /// <summary>
         /// Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside
         /// this package, and probably not outside the <see cref="PhoneticEngine"/> class.
         /// <para/>
         /// The builder is mutable: the <c>Append</c> overloads and <see cref="Apply(IPhonemeExpr, int)"/> modify
         /// the underlying phoneme set in place rather than returning a new builder.
         /// <para/>
         /// since 1.6
         /// </summary>
         internal sealed class PhonemeBuilder
         {
             /// <summary>
             /// An empty builder where all phonemes must come from some set of languages. This will contain a single
             /// phoneme of zero characters. This can then be appended to. This should be the only way to create a new
             /// phoneme from scratch.
             /// </summary>
             /// <param name="languages">The set of languages.</param>
             /// <returns>A new, empty phoneme builder.</returns>
             public static PhonemeBuilder Empty(LanguageSet languages)
             {
                 return new PhonemeBuilder(new Phoneme("", languages));
             }

             private readonly ISet<Phoneme> phonemes;

             /// <summary>
             /// Creates a builder whose set initially holds only <paramref name="phoneme"/>.
             /// </summary>
             /// <param name="phoneme">The single starting phoneme.</param>
             private PhonemeBuilder(Phoneme phoneme)
             {
                 this.phonemes = new JCG.LinkedHashSet<Phoneme>
                 {
                     phoneme
                 };
             }

             /// <summary>
             /// Creates a builder over <paramref name="phonemes"/>. The set is stored as-is (it is not copied),
             /// so changes made through this builder are visible through the supplied reference.
             /// </summary>
             /// <param name="phonemes">The phoneme set to wrap.</param>
             internal PhonemeBuilder(ISet<Phoneme> phonemes)
             {
                 this.phonemes = phonemes;
             }

             /// <summary>
             /// Appends <paramref name="str"/> to every phoneme currently held by this builder (in place).
             /// </summary>
             /// <param name="str">The characters to append to the phonemes.</param>
             public void Append(ICharSequence str)
             {
                 // Convert once rather than once per phoneme, then reuse the string overload's loop.
                 Append(str.ToString());
             }

             /// <summary>
             /// Appends <paramref name="str"/> to every phoneme currently held by this builder (in place).
             /// </summary>
             /// <param name="str">The characters to append to the phonemes.</param>
             // LUCENENET specific
             public void Append(string str)
             {
                 foreach (Phoneme ph in this.phonemes)
                 {
                     ph.Append(str);
                 }
             }

             /// <summary>
             /// Appends the current content of <paramref name="str"/> to every phoneme held by this builder (in place).
             /// </summary>
             /// <param name="str">The characters to append to the phonemes.</param>
             // LUCENENET specific
             public void Append(StringBuilder str)
             {
                 // Materialise the text once rather than once per phoneme, then reuse the string overload's loop.
                 Append(str.ToString());
             }

             /// <summary>
             /// Applies the given phoneme expression to all phonemes in this phoneme builder.
             /// <para/>
             /// This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
             /// incompatible. The builder's set is replaced in place with the joined phonemes, and joining stops as
             /// soon as <paramref name="maxPhonemes"/> results have been collected.
             /// </summary>
             /// <param name="phonemeExpr">The expression to apply.</param>
             /// <param name="maxPhonemes">The maximum number of phonemes to build up.</param>
             public void Apply(IPhonemeExpr phonemeExpr, int maxPhonemes)
             {
                 ISet<Phoneme> newPhonemes = new JCG.LinkedHashSet<Phoneme>(maxPhonemes);

                 foreach (Phoneme left in this.phonemes)
                 {
                     foreach (Phoneme right in phonemeExpr.Phonemes)
                     {
                         // only join phonemes that share at least one possible language
                         LanguageSet languages = left.Languages.RestrictTo(right.Languages);
                         if (!languages.IsEmpty)
                         {
                             Phoneme join = new Phoneme(left, right, languages);
                             if (newPhonemes.Count < maxPhonemes)
                             {
                                 newPhonemes.Add(join);
                                 if (newPhonemes.Count >= maxPhonemes)
                                 {
                                     // cap reached: leave both loops at once
                                     goto EXPR_break;
                                 }
                             }
                         }
                     }
                 }
                 EXPR_break: { }

                 this.phonemes.Clear();
                 this.phonemes.UnionWith(newPhonemes);
             }

             /// <summary>
             /// Gets underlying phoneme set. Please don't mutate.
             /// </summary>
             public ISet<Phoneme> Phonemes => phonemes;

             /// <summary>
             /// Stringifies the phoneme set. This produces a single string of the strings of each phoneme,
             /// joined with a pipe. This is explicitly provided in place of <see cref="object.ToString()"/> as it is a potentially
             /// expensive operation, which should be avoided when debugging.
             /// </summary>
             /// <returns>The stringified phoneme set.</returns>
             public string MakeString()
             {
                 StringBuilder sb = new StringBuilder();

                 foreach (Phoneme ph in this.phonemes)
                 {
                     // the separator is only written once some text has already been emitted
                     if (sb.Length > 0)
                     {
                         sb.Append('|');
                     }
                     sb.Append(ph.GetPhonemeText());
                 }

                 return sb.ToString();
             }
         }

         /// <summary>
         /// Captures one attempt to apply a rule table to an input string at a given offset.
         /// <see cref="Invoke"/> updates two pieces of state: <c>i</c>, the index of the next unprocessed char of
         /// <c>input</c>, and <c>found</c>, which reports whether any rule matched. When a rule matches, its
         /// phoneme expression is applied to the supplied <c>phonemeBuilder</c>, which is modified in place.
         /// <para/>
         /// Instances carry mutable, unsynchronised state; the calling methods create a fresh instance per step
         /// rather than sharing one.
         /// <para/>
         /// since 1.6
         /// </summary>
         private sealed class RulesApplication
         {
             private readonly IDictionary<string, IList<Rule>> finalRules;
             private readonly string input;
             private readonly PhonemeBuilder phonemeBuilder;
             private readonly int maxPhonemes;

             private int i;
             private bool found;

             public RulesApplication(IDictionary<string, IList<Rule>> finalRules, string input,
                                     PhonemeBuilder phonemeBuilder, int i, int maxPhonemes)
             {
                 if (finalRules == null)
                 {
                     throw new ArgumentNullException(nameof(finalRules), "The finalRules argument must not be null");
                 }

                 this.finalRules = finalRules;
                 this.input = input;
                 this.phonemeBuilder = phonemeBuilder;
                 this.maxPhonemes = maxPhonemes;
                 this.i = i;
             }

             /// <summary>Index of the next char of the input still to be processed.</summary>
             public int I => this.i;

             /// <summary>The builder handed to the constructor.</summary>
             public PhonemeBuilder PhonemeBuilder => this.phonemeBuilder;

             /// <summary><c>true</c> if the most recent <see cref="Invoke"/> matched a rule.</summary>
             public bool IsFound => this.found;

             /// <summary>
             /// Runs the rules registered under the single character at the current index, in order, and stops at
             /// the first whose pattern and context match. That rule's phonemes are applied to the builder and
             /// <c>i</c> moves forward by the rule's pattern length. With no match, <c>i</c> moves forward by one
             /// and the builder is left untouched.
             /// </summary>
             /// <returns><c>this</c></returns>
             public RulesApplication Invoke()
             {
                 found = false;
                 int consumed = 1;

                 string key = input.Substring(i, 1);
                 IList<Rule> candidates;
                 if (finalRules.TryGetValue(key, out candidates) && candidates != null)
                 {
                     foreach (Rule candidate in candidates)
                     {
                         int candidateLength = candidate.Pattern.Length;
                         if (!candidate.PatternAndContextMatches(input, i))
                         {
                             continue;
                         }

                         phonemeBuilder.Apply(candidate.Phoneme, maxPhonemes);
                         consumed = candidateLength;
                         found = true;
                         break;
                     }
                 }

                 i += consumed;
                 return this;
             }
         }

         private static readonly IDictionary<NameType, ISet<string>> NAME_PREFIXES = LoadNamePrefixes();

         /// <summary>
         /// Builds the table of name prefixes for each <see cref="Bm.NameType"/>. Every prefix set is wrapped
         /// read-only before being stored.
         /// </summary>
         private static IDictionary<NameType, ISet<string>> LoadNamePrefixes() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
         {
             ISet<string> ashkenazi = new JCG.HashSet<string>()
             {
                 "bar", "ben", "da", "de", "van", "von"
             }.AsReadOnly();

             ISet<string> sephardic = new JCG.HashSet<string>()
             {
                 "al", "el", "da", "dal", "de", "del", "dela", "de la",
                 "della", "des", "di", "do", "dos", "du", "van", "von"
             }.AsReadOnly();

             ISet<string> generic = new JCG.HashSet<string>()
             {
                 "da", "dal", "de", "del", "dela", "de la", "della",
                 "des", "di", "do", "dos", "du", "van", "von"
             }.AsReadOnly();

             IDictionary<NameType, ISet<string>> prefixesByType = new Dictionary<NameType, ISet<string>>();
             prefixesByType[NameType.ASHKENAZI] = ashkenazi;
             prefixesByType[NameType.SEPHARDIC] = sephardic;
             prefixesByType[NameType.GENERIC] = generic;
             return prefixesByType;
         }

         /// <summary>
         /// Joins some strings with an internal separator.
         /// </summary>
         /// <param name="strings">Strings to join.</param>
         /// <param name="sep">String to separate them with.</param>
         /// <returns>A single string consisting of each element of <paramref name="strings"/> interleaved by <paramref name="sep"/>.</returns>
         private static string Join(IEnumerable<string> strings, string sep)
         {
             // The framework join performs the same interleaving as a manual enumerator loop:
             // no separator before the first element or after the last.
             return string.Join(sep, strings);
         }

         private const int DEFAULT_MAX_PHONEMES = 20;

         private readonly Lang lang;

         private readonly NameType nameType;

         private readonly RuleType ruleType;

         private readonly bool concat;

         private readonly int maxPhonemes;

         /// <summary>
         /// Generates a new, fully-configured phonetic engine that uses the default phoneme limit
         /// (<c>DEFAULT_MAX_PHONEMES</c>, i.e. 20).
         /// </summary>
         /// <param name="nameType">The type of names it will use.</param>
         /// <param name="ruleType">The type of rules it will apply.</param>
         /// <param name="concat">If it will concatenate multiple encodings.</param>
         /// <exception cref="ArgumentException">Thrown by the delegated constructor when
         /// <paramref name="ruleType"/> is <see cref="RuleType.RULES"/>.</exception>
         public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat)
             : this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES)
         {
         }

         /// <summary>
         /// Generates a new, fully-configured phonetic engine.
         /// <para/>
         /// since 1.7
         /// </summary>
         /// <param name="nameType">The type of names it will use.</param>
         /// <param name="ruleType">The type of rules it will apply.</param>
         /// <param name="concat">If it will concatenate multiple encodings.</param>
         /// <param name="maxPhonemes">The maximum number of phonemes that will be handled.</param>
         /// <exception cref="ArgumentException"><paramref name="ruleType"/> is <see cref="RuleType.RULES"/>.</exception>
         public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat,
                               int maxPhonemes)
         {
             if (ruleType == RuleType.RULES)
             {
                 // name the offending parameter so the exception identifies which argument was rejected
                 throw new ArgumentException("ruleType must not be " + RuleType.RULES, nameof(ruleType));
             }
             this.nameType = nameType;
             this.ruleType = ruleType;
             this.concat = concat;
             this.lang = Lang.GetInstance(nameType);
             this.maxPhonemes = maxPhonemes;
         }

         /// <summary>
         /// Applies the final rules to convert from a language-specific phonetic representation to a
         /// language-independent representation.
         /// </summary>
         /// <param name="phonemeBuilder">The current phonemes.</param>
         /// <param name="finalRules">The final rules to apply.</param>
         /// <returns>The resulting phonemes; <paramref name="phonemeBuilder"/> itself when
         /// <paramref name="finalRules"/> is empty.</returns>
         /// <exception cref="ArgumentNullException"><paramref name="finalRules"/> is <c>null</c>.</exception>
         private PhonemeBuilder ApplyFinalRules(PhonemeBuilder phonemeBuilder,
                                                IDictionary<string, IList<Rule>> finalRules)
         {
             if (finalRules == null)
             {
                 // The single-string ArgumentNullException constructor takes the parameter NAME, not a message,
                 // so supply both explicitly.
                 throw new ArgumentNullException(nameof(finalRules), "finalRules can not be null");
             }
             if (finalRules.Count == 0)
             {
                 return phonemeBuilder;
             }

             ISet<Phoneme> phonemes = new JCG.SortedSet<Phoneme>(Phoneme.COMPARER);

             foreach (Phoneme phoneme in phonemeBuilder.Phonemes)
             {
                 // each source phoneme is rewritten independently, starting from an empty builder
                 // restricted to that phoneme's languages
                 PhonemeBuilder subBuilder = PhonemeBuilder.Empty(phoneme.Languages);
                 string phonemeText = phoneme.GetPhonemeText();

                 // the index is advanced manually from the rules application result
                 for (int i = 0; i < phonemeText.Length;)
                 {
                     RulesApplication rulesApplication =
                             new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).Invoke();
                     bool found = rulesApplication.IsFound;
                     subBuilder = rulesApplication.PhonemeBuilder;

                     if (!found)
                     {
                         // not found, appending as-is
                         subBuilder.Append(phonemeText.Substring(i, 1));
                     }

                     i = rulesApplication.I;
                 }

                 phonemes.UnionWith(subBuilder.Phonemes);
             }

             return new PhonemeBuilder(phonemes);
         }

         /// <summary>
         /// Encodes a string to its phonetic representation, first guessing the candidate languages
         /// from the input itself.
         /// </summary>
         /// <param name="input">The string to encode.</param>
         /// <returns>The encoding of the input.</returns>
         public virtual string Encode(string input)
             => Encode(input, this.lang.GuessLanguages(input));

         /// <summary>
         /// Encodes an input string into an output phonetic representation, given a set of possible origin languages.
         /// </summary>
         /// <param name="input">String to phoneticise; a string with dashes or spaces separating each word.</param>
         /// <param name="languageSet">The possible origin languages; used to select both the language rules and
         /// the language-specific final rules.</param>
         /// <returns>A phonetic representation of the input; a string containing '-'-separated phonetic representations of the input.
         /// An empty string is returned when, outside concat mode, no words remain to encode.</returns>
         public virtual string Encode(string input, LanguageSet languageSet)
         {
             IDictionary<string, IList<Rule>> rules = Rule.GetInstanceMap(this.nameType, RuleType.RULES, languageSet);
             // rules common across many (all) languages
             IDictionary<string, IList<Rule>> finalRules1 = Rule.GetInstanceMap(this.nameType, this.ruleType, "common");
             // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
             IDictionary<string, IList<Rule>> finalRules2 = Rule.GetInstanceMap(this.nameType, this.ruleType, languageSet);

             // tidy the input
             // lower-casing is done with the invariant culture so the result does not depend on the current locale
             input = input.ToLowerInvariant().Replace('-', ' ').Trim();

             if (this.nameType == NameType.GENERIC)
             {
                 if (input.StartsWith("d'", StringComparison.Ordinal))
                 { // check for d'
                     string remainder = input.Substring(2);
                     string combined = "d" + remainder;
                     return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
                 }
                 foreach (string l in NAME_PREFIXES[this.nameType])
                 {
                     // handle generic prefixes
                     if (input.StartsWith(l + " ", StringComparison.Ordinal))
                     {
                         // check for any prefix in the words list
                         string remainder = input.Substring(l.Length + 1); // input without the prefix
                         string combined = l + remainder; // input with prefix without space
                         return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
                     }
                 }
             }

             IList<string> words = WHITESPACE.Split(input).TrimEnd();
             IList<string> words2 = new List<string>();

             // special-case handling of word prefixes based upon the name type
             switch (this.nameType)
             {
                 case NameType.SEPHARDIC:
                     foreach (string aWord in words)
                     {
                         // keep only the part after the last apostrophe
                         string[] parts = aWord.Split('\'').TrimEnd();
                         string lastPart = parts[parts.Length - 1];
                         words2.Add(lastPart);
                     }
                     words2.RemoveAll(NAME_PREFIXES[this.nameType]);
                     break;
                 case NameType.ASHKENAZI:
                     words2.AddRange(words);
                     words2.RemoveAll(NAME_PREFIXES[this.nameType]);
                     break;
                 case NameType.GENERIC:
                     words2.AddRange(words);
                     break;
                 default:
                     throw new InvalidOperationException("Unreachable case: " + this.nameType);
             }

             if (this.concat)
             {
                 // concat mode enabled
                 input = Join(words2, " ");
             }
             else if (words2.Count == 1)
             {
                 // not a multi-word name
                 //input = words.iterator().next();
                 // NOTE(review): this takes the first ORIGINAL word (before apostrophe/prefix stripping), not
                 // words2[0]. It mirrors the upstream statement quoted above; confirm against the expected
                 // encodings before changing it.
                 input = words[0];
             }
             else if (words2.Count == 0)
             {
                 // Nothing left to encode (e.g. every word was a stripped prefix). Without this guard the
                 // "drop the leading '-'" step below would ask for a substring of length -1 and throw.
                 return string.Empty;
             }
             else
             {
                 // encode each word in a multi-word name separately (normally used for approx matches)
                 StringBuilder result = new StringBuilder();
                 foreach (string word in words2)
                 {
                     result.Append('-').Append(Encode(word));
                 }
                 // return the result without the leading "-"
                 return result.ToString(1, result.Length - 1);
             }

             PhonemeBuilder phonemeBuilder = PhonemeBuilder.Empty(languageSet);

             // loop over each char in the input - we will handle the increment manually
             for (int i = 0; i < input.Length;)
             {
                 RulesApplication rulesApplication =
                         new RulesApplication(rules, input, phonemeBuilder, i, maxPhonemes).Invoke();
                 i = rulesApplication.I;
                 phonemeBuilder = rulesApplication.PhonemeBuilder;
             }

             // Apply the general rules
             phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules1);
             // Apply the language-specific rules
             phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules2);

             return phonemeBuilder.MakeString();
         }

         /// <summary>
         /// The language-guessing rules this engine was constructed with.
         /// </summary>
         public virtual Lang Lang => this.lang;

         /// <summary>
         /// The <see cref="Bm.NameType"/> this engine was constructed with.
         /// </summary>
         public virtual NameType NameType => this.nameType;

         /// <summary>
         /// The <see cref="Bm.RuleType"/> this engine was constructed with.
         /// </summary>
         public virtual RuleType RuleType => this.ruleType;

         /// <summary>
         /// Whether multiple phonetic encodings are concatenated: <c>true</c> if multiple encodings are
         /// returned, <c>false</c> if only the first one is kept.
         /// </summary>
         public virtual bool IsConcat => this.concat;

         /// <summary>
         /// The maximum number of phonemes the engine will calculate for a given input.
         /// <para/>
         /// since 1.7
         /// </summary>
         public virtual int MaxPhonemes => this.maxPhonemes;
     }
 }
	// commons-codec version compatibility level: 1.9
	using J2N.Collections.Generic.Extensions;
	using J2N.Text;
	using Lucene.Net.Support;
	using Lucene.Net.Util;
	using System;
	using System.Collections.Generic;
	using System.Text;
	using System.Text.RegularExpressions;
	using JCG = J2N.Collections.Generic;

	namespace Lucene.Net.Analysis.Phonetic.Language.Bm
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Converts words into potential phonetic representations.
	/// </summary>
	/// <remarks>
	/// This is a two-stage process. Firstly, the word is converted into a phonetic representation that takes
	/// into account the likely source language. Next, this phonetic representation is converted into a
	/// pan-European 'average' representation, allowing comparison between different versions of essentially
	/// the same word from different languages.
	/// <para/>
	/// This class is intentionally immutable and thread-safe.
	/// If you wish to alter the settings for a PhoneticEngine, you
	/// must make a new one with the updated settings.
	/// <para/>
	/// Ported from phoneticengine.php
	/// <para/>
	/// since 1.6
	/// </remarks>
	public class PhoneticEngine
	{
	// Matches runs of whitespace when splitting the tidied input into words. Declared static and
	// read-only so the compiled pattern is built once (not once per engine instance) and cannot be
	// reassigned, in keeping with the immutability this class documents.
	internal static readonly Regex WHITESPACE = new Regex("\\s+", RegexOptions.Compiled);

	/// <summary>
	/// Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside
	/// this package, and probably not outside the <see cref="PhoneticEngine"/> class.
	/// <para/>
	/// The builder is mutable: the <c>Append</c> overloads and <see cref="Apply(IPhonemeExpr, int)"/> modify
	/// the underlying phoneme set in place rather than returning a new builder.
	/// <para/>
	/// since 1.6
	/// </summary>
	internal sealed class PhonemeBuilder
	{
	    /// <summary>
	    /// An empty builder where all phonemes must come from some set of languages. This will contain a single
	    /// phoneme of zero characters. This can then be appended to. This should be the only way to create a new
	    /// phoneme from scratch.
	    /// </summary>
	    /// <param name="languages">The set of languages.</param>
	    /// <returns>A new, empty phoneme builder.</returns>
	    public static PhonemeBuilder Empty(LanguageSet languages)
	    {
	        return new PhonemeBuilder(new Phoneme("", languages));
	    }

	    private readonly ISet<Phoneme> phonemes;

	    /// <summary>
	    /// Creates a builder whose set initially holds only <paramref name="phoneme"/>.
	    /// </summary>
	    /// <param name="phoneme">The single starting phoneme.</param>
	    private PhonemeBuilder(Phoneme phoneme)
	    {
	        this.phonemes = new JCG.LinkedHashSet<Phoneme>
	        {
	            phoneme
	        };
	    }

	    /// <summary>
	    /// Creates a builder over <paramref name="phonemes"/>. The set is stored as-is (it is not copied),
	    /// so changes made through this builder are visible through the supplied reference.
	    /// </summary>
	    /// <param name="phonemes">The phoneme set to wrap.</param>
	    internal PhonemeBuilder(ISet<Phoneme> phonemes)
	    {
	        this.phonemes = phonemes;
	    }

	    /// <summary>
	    /// Appends <paramref name="str"/> to every phoneme currently held by this builder (in place).
	    /// </summary>
	    /// <param name="str">The characters to append to the phonemes.</param>
	    public void Append(ICharSequence str)
	    {
	        // Convert once rather than once per phoneme, then reuse the string overload's loop.
	        Append(str.ToString());
	    }

	    /// <summary>
	    /// Appends <paramref name="str"/> to every phoneme currently held by this builder (in place).
	    /// </summary>
	    /// <param name="str">The characters to append to the phonemes.</param>
	    // LUCENENET specific
	    public void Append(string str)
	    {
	        foreach (Phoneme ph in this.phonemes)
	        {
	            ph.Append(str);
	        }
	    }

	    /// <summary>
	    /// Appends the current content of <paramref name="str"/> to every phoneme held by this builder (in place).
	    /// </summary>
	    /// <param name="str">The characters to append to the phonemes.</param>
	    // LUCENENET specific
	    public void Append(StringBuilder str)
	    {
	        // Materialise the text once rather than once per phoneme, then reuse the string overload's loop.
	        Append(str.ToString());
	    }

	    /// <summary>
	    /// Applies the given phoneme expression to all phonemes in this phoneme builder.
	    /// <para/>
	    /// This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
	    /// incompatible. The builder's set is replaced in place with the joined phonemes, and joining stops as
	    /// soon as <paramref name="maxPhonemes"/> results have been collected.
	    /// </summary>
	    /// <param name="phonemeExpr">The expression to apply.</param>
	    /// <param name="maxPhonemes">The maximum number of phonemes to build up.</param>
	    public void Apply(IPhonemeExpr phonemeExpr, int maxPhonemes)
	    {
	        ISet<Phoneme> newPhonemes = new JCG.LinkedHashSet<Phoneme>(maxPhonemes);

	        foreach (Phoneme left in this.phonemes)
	        {
	            foreach (Phoneme right in phonemeExpr.Phonemes)
	            {
	                // only join phonemes that share at least one possible language
	                LanguageSet languages = left.Languages.RestrictTo(right.Languages);
	                if (!languages.IsEmpty)
	                {
	                    Phoneme join = new Phoneme(left, right, languages);
	                    if (newPhonemes.Count < maxPhonemes)
	                    {
	                        newPhonemes.Add(join);
	                        if (newPhonemes.Count >= maxPhonemes)
	                        {
	                            // cap reached: leave both loops at once
	                            goto EXPR_break;
	                        }
	                    }
	                }
	            }
	        }
	        EXPR_break: { }

	        this.phonemes.Clear();
	        this.phonemes.UnionWith(newPhonemes);
	    }

	    /// <summary>
	    /// Gets underlying phoneme set. Please don't mutate.
	    /// </summary>
	    public ISet<Phoneme> Phonemes => phonemes;

	    /// <summary>
	    /// Stringifies the phoneme set. This produces a single string of the strings of each phoneme,
	    /// joined with a pipe. This is explicitly provided in place of <see cref="object.ToString()"/> as it is a potentially
	    /// expensive operation, which should be avoided when debugging.
	    /// </summary>
	    /// <returns>The stringified phoneme set.</returns>
	    public string MakeString()
	    {
	        StringBuilder sb = new StringBuilder();

	        foreach (Phoneme ph in this.phonemes)
	        {
	            // the separator is only written once some text has already been emitted
	            if (sb.Length > 0)
	            {
	                // a plain pipe: the pipe character needs no escaping in a C# literal
	                sb.Append('|');
	            }
	            sb.Append(ph.GetPhonemeText());
	        }

	        return sb.ToString();
	    }
	}

	/// <summary>
	/// Captures one attempt to apply a rule table to an input string at a given offset.
	/// <see cref="Invoke"/> updates two pieces of state: <c>i</c>, the index of the next unprocessed char of
	/// <c>input</c>, and <c>found</c>, which reports whether any rule matched. When a rule matches, its
	/// phoneme expression is applied to the supplied <c>phonemeBuilder</c>, which is modified in place.
	/// <para/>
	/// Instances carry mutable, unsynchronised state; the calling methods create a fresh instance per step
	/// rather than sharing one.
	/// <para/>
	/// since 1.6
	/// </summary>
	private sealed class RulesApplication
	{
	    private readonly IDictionary<string, IList<Rule>> finalRules;
	    private readonly string input;
	    private readonly PhonemeBuilder phonemeBuilder;
	    private readonly int maxPhonemes;

	    private int i;
	    private bool found;

	    public RulesApplication(IDictionary<string, IList<Rule>> finalRules, string input,
	                            PhonemeBuilder phonemeBuilder, int i, int maxPhonemes)
	    {
	        if (finalRules == null)
	        {
	            throw new ArgumentNullException(nameof(finalRules), "The finalRules argument must not be null");
	        }

	        this.finalRules = finalRules;
	        this.input = input;
	        this.phonemeBuilder = phonemeBuilder;
	        this.maxPhonemes = maxPhonemes;
	        this.i = i;
	    }

	    /// <summary>Index of the next char of the input still to be processed.</summary>
	    public int I => this.i;

	    /// <summary>The builder handed to the constructor.</summary>
	    public PhonemeBuilder PhonemeBuilder => this.phonemeBuilder;

	    /// <summary><c>true</c> if the most recent <see cref="Invoke"/> matched a rule.</summary>
	    public bool IsFound => this.found;

	    /// <summary>
	    /// Runs the rules registered under the single character at the current index, in order, and stops at
	    /// the first whose pattern and context match. That rule's phonemes are applied to the builder and
	    /// <c>i</c> moves forward by the rule's pattern length. With no match, <c>i</c> moves forward by one
	    /// and the builder is left untouched.
	    /// </summary>
	    /// <returns><c>this</c></returns>
	    public RulesApplication Invoke()
	    {
	        found = false;
	        int consumed = 1;

	        string key = input.Substring(i, 1);
	        IList<Rule> candidates;
	        if (finalRules.TryGetValue(key, out candidates) && candidates != null)
	        {
	            foreach (Rule candidate in candidates)
	            {
	                int candidateLength = candidate.Pattern.Length;
	                if (!candidate.PatternAndContextMatches(input, i))
	                {
	                    continue;
	                }

	                phonemeBuilder.Apply(candidate.Phoneme, maxPhonemes);
	                consumed = candidateLength;
	                found = true;
	                break;
	            }
	        }

	        i += consumed;
	        return this;
	    }
	}

	private static readonly IDictionary<NameType, ISet<string>> NAME_PREFIXES = LoadNamePrefixes();

	/// <summary>
	/// Builds the table of name prefixes for each <see cref="Bm.NameType"/>. Every prefix set is wrapped
	/// read-only before being stored.
	/// </summary>
	private static IDictionary<NameType, ISet<string>> LoadNamePrefixes() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
	{
	    ISet<string> ashkenazi = new JCG.HashSet<string>()
	    {
	        "bar", "ben", "da", "de", "van", "von"
	    }.AsReadOnly();

	    ISet<string> sephardic = new JCG.HashSet<string>()
	    {
	        "al", "el", "da", "dal", "de", "del", "dela", "de la",
	        "della", "des", "di", "do", "dos", "du", "van", "von"
	    }.AsReadOnly();

	    ISet<string> generic = new JCG.HashSet<string>()
	    {
	        "da", "dal", "de", "del", "dela", "de la", "della",
	        "des", "di", "do", "dos", "du", "van", "von"
	    }.AsReadOnly();

	    IDictionary<NameType, ISet<string>> prefixesByType = new Dictionary<NameType, ISet<string>>();
	    prefixesByType[NameType.ASHKENAZI] = ashkenazi;
	    prefixesByType[NameType.SEPHARDIC] = sephardic;
	    prefixesByType[NameType.GENERIC] = generic;
	    return prefixesByType;
	}

	/// <summary>
	/// Joins some strings with an internal separator.
	/// </summary>
	/// <param name="strings">Strings to join.</param>
	/// <param name="sep">String to separate them with.</param>
	/// <returns>
	/// A single string consisting of each element of <paramref name="strings"/> interleaved by
	/// <paramref name="sep"/>; the empty string when <paramref name="strings"/> has no elements.
	/// </returns>
	private static string Join(IEnumerable<string> strings, string sep)
	{
		// The framework join already skips the separator before the first element
		// and yields "" for an empty sequence.
		return string.Join(sep, strings);
	}

	// Phoneme cap used by the three-argument constructor, which does not take one from the caller.
	private const int DEFAULT_MAX_PHONEMES = 20;

	// Language guesser; obtained from Lang.GetInstance(nameType) in the constructor.
	private readonly Lang lang;

	// Name type used to select rules and to decide how name prefixes are handled in Encode.
	private readonly NameType nameType;

	// Rule type used to load the final rules; the constructor rejects RuleType.RULES.
	private readonly RuleType ruleType;

	// When true, Encode joins the words of a name with spaces and encodes them as one input.
	private readonly bool concat;

	// Upper bound handed to every RulesApplication created by this engine.
	private readonly int maxPhonemes;

	/// <summary>
	/// Generates a new, fully-configured phonetic engine that uses the default
	/// maximum number of phonemes (<c>DEFAULT_MAX_PHONEMES</c>).
	/// </summary>
	/// <param name="nameType">The type of names it will use.</param>
	/// <param name="ruleType">The type of rules it will apply.</param>
	/// <param name="concat">If it will concatenate multiple encodings.</param>
	/// <exception cref="ArgumentException">If <paramref name="ruleType"/> is <c>RuleType.RULES</c> (thrown by the constructor this one delegates to).</exception>
	public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat)
	: this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES)
	{
	}

	/// <summary>
	/// Creates a phonetic engine with an explicit limit on the number of phonemes.
	/// <para/>
	/// since 1.7
	/// </summary>
	/// <param name="nameType">The type of names it will use.</param>
	/// <param name="ruleType">The type of rules it will apply; must not be <c>RuleType.RULES</c>.</param>
	/// <param name="concat">If it will concatenate multiple encodings.</param>
	/// <param name="maxPhonemes">The maximum number of phonemes that will be handled.</param>
	/// <exception cref="ArgumentException">If <paramref name="ruleType"/> is <c>RuleType.RULES</c>.</exception>
	public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat,
		int maxPhonemes)
	{
		bool isLanguageRules = ruleType == RuleType.RULES;
		if (isLanguageRules)
		{
			throw new ArgumentException($"ruleType must not be {RuleType.RULES}");
		}

		this.lang = Lang.GetInstance(nameType);
		this.nameType = nameType;
		this.ruleType = ruleType;
		this.maxPhonemes = maxPhonemes;
		this.concat = concat;
	}

	/// <summary>
	/// Applies the final rules to convert from a language-specific phonetic representation to a
	/// language-independent representation.
	/// </summary>
	/// <param name="phonemeBuilder">The current phonemes.</param>
	/// <param name="finalRules">The final rules to apply.</param>
	/// <returns>
	/// The resulting phonemes; <paramref name="phonemeBuilder"/> itself when
	/// <paramref name="finalRules"/> is empty.
	/// </returns>
	/// <exception cref="ArgumentNullException">If <paramref name="finalRules"/> is <c>null</c>.</exception>
	private PhonemeBuilder ApplyFinalRules(PhonemeBuilder phonemeBuilder,
		IDictionary<string, IList<Rule>> finalRules)
	{
		if (finalRules == null)
		{
			// The one-argument overload takes the parameter name, not a message, so pass both.
			throw new ArgumentNullException(nameof(finalRules), "finalRules can not be null");
		}
		if (finalRules.Count == 0)
		{
			return phonemeBuilder;
		}

		// Collect the rewritten phonemes in a sorted set ordered by Phoneme.COMPARER.
		ISet<Phoneme> phonemes = new JCG.SortedSet<Phoneme>(Phoneme.COMPARER);

		foreach (Phoneme phoneme in phonemeBuilder.Phonemes)
		{
			PhonemeBuilder subBuilder = PhonemeBuilder.Empty(phoneme.Languages);
			string phonemeText = phoneme.GetPhonemeText();

			// The position is advanced from the rules application, not by the loop header.
			for (int i = 0; i < phonemeText.Length;)
			{
				RulesApplication rulesApplication =
					new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).Invoke();
				bool found = rulesApplication.IsFound;
				subBuilder = rulesApplication.PhonemeBuilder;

				if (!found)
				{
					// not found, appending as-is
					subBuilder.Append(phonemeText.Substring(i, 1));
				}

				i = rulesApplication.I;
			}

			phonemes.UnionWith(subBuilder.Phonemes);
		}

		return new PhonemeBuilder(phonemes);
	}

	/// <summary>
	/// Encodes a string to its phonetic representation, using the languages guessed
	/// for <paramref name="input"/> by this engine's <see cref="Lang"/>.
	/// </summary>
	/// <param name="input">The string to encode.</param>
	/// <returns>The encoding of the input.</returns>
	public virtual string Encode(string input)
		=> Encode(input, this.lang.GuessLanguages(input));

	/// <summary>
	/// Encodes an input string into an output phonetic representation, given a set of possible origin languages.
	/// </summary>
	/// <param name="input">String to phoneticise; a string with dashes or spaces separating each word.</param>
	/// <param name="languageSet">The set of possible origin languages, used to select the rules that are applied.</param>
	/// <returns>A phonetic representation of the input; a string containing '-'-separated phonetic representations of the input.</returns>
	public virtual string Encode(string input, LanguageSet languageSet)
	{
		IDictionary<string, IList<Rule>> rules = Rule.GetInstanceMap(this.nameType, RuleType.RULES, languageSet);
		// rules common across many (all) languages
		IDictionary<string, IList<Rule>> finalRules1 = Rule.GetInstanceMap(this.nameType, this.ruleType, "common");
		// rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
		IDictionary<string, IList<Rule>> finalRules2 = Rule.GetInstanceMap(this.nameType, this.ruleType, languageSet);

		// tidy the input
		// lower-casing uses the invariant culture so the result does not depend on the current locale
		input = input.ToLowerInvariant().Replace('-', ' ').Trim();

		if (this.nameType == NameType.GENERIC)
		{
			if (input.StartsWith("d'", StringComparison.Ordinal))
			{ // check for d'
				string remainder = input.Substring(2);
				string combined = "d" + remainder;
				return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
			}
			foreach (string l in NAME_PREFIXES[this.nameType])
			{
				// handle generic prefixes
				if (input.StartsWith(l + " ", StringComparison.Ordinal))
				{
					// check for any prefix in the words list
					string remainder = input.Substring(l.Length + 1); // input without the prefix
					string combined = l + remainder; // input with prefix without space
					return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
				}
			}
		}

		IList<string> words = WHITESPACE.Split(input).TrimEnd();
		IList<string> words2 = new List<string>();

		// special-case handling of word prefixes based upon the name type
		switch (this.nameType)
		{
			case NameType.SEPHARDIC:
				foreach (string aWord in words)
				{
					// keep only the part after the last apostrophe
					string[] parts = aWord.Split('\'').TrimEnd();
					string lastPart = parts[parts.Length - 1];
					words2.Add(lastPart);
				}
				words2.RemoveAll(NAME_PREFIXES[this.nameType]);
				break;
			case NameType.ASHKENAZI:
				words2.AddRange(words);
				words2.RemoveAll(NAME_PREFIXES[this.nameType]);
				break;
			case NameType.GENERIC:
				words2.AddRange(words);
				break;
			default:
				throw new InvalidOperationException("Unreachable case: " + this.nameType);
		}

		if (this.concat)
		{
			// concat mode enabled
			input = Join(words2, " ");
		}
		else if (words2.Count == 1)
		{
			// not a multi-word name
			// NOTE(review): this reads from words rather than words2, mirroring the Java
			// line it was ported from (input = words.iterator().next();) - confirm intended.
			input = words[0];
		}
		else if (words2.Count > 1)
		{
			// encode each word in a multi-word name separately (normally used for approx matches)
			StringBuilder result = new StringBuilder();
			foreach (string word in words2)
			{
				result.Append("-").Append(Encode(word));
			}
			// return the result without the leading "-"
			return result.ToString(1, result.Length - 1);
		}
		// Otherwise words2 is empty (every word was removed as a prefix): fall through and
		// encode the tidied input as-is instead of slicing an empty builder, which would throw.

		PhonemeBuilder phonemeBuilder = PhonemeBuilder.Empty(languageSet);

		// loop over each char in the input - we will handle the increment manually
		for (int i = 0; i < input.Length;)
		{
			RulesApplication rulesApplication =
				new RulesApplication(rules, input, phonemeBuilder, i, maxPhonemes).Invoke();
			i = rulesApplication.I;
			phonemeBuilder = rulesApplication.PhonemeBuilder;
		}

		// Apply the general rules
		phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules1);
		// Apply the language-specific rules
		phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules2);

		return phonemeBuilder.MakeString();
	}

	/// <summary>
	/// Gets the <see cref="Bm.Lang"/> language guessing rules being used.
	/// This is the instance obtained for the engine's name type when the engine was constructed.
	/// </summary>
	public virtual Lang Lang => lang;

	/// <summary>
	/// Gets the <see cref="Bm.NameType"/> being used, as passed to the constructor.
	/// </summary>
	public virtual NameType NameType => nameType;

	/// <summary>
	/// Gets the <see cref="Bm.RuleType"/> being used, as passed to the constructor.
	/// It is never <c>RuleType.RULES</c>, which the constructor rejects.
	/// </summary>
	public virtual RuleType RuleType => ruleType;

	/// <summary>
	/// Gets whether concat mode is enabled.
	/// Returns <c>true</c> if the words of a name are joined with spaces and encoded as a single input,
	/// <c>false</c> if each word of a multi-word name is encoded separately.
	/// </summary>
	public virtual bool IsConcat => concat;

	/// <summary>
	/// Gets the maximum number of phonemes the engine will calculate for a given input.
	/// This is the value given to the constructor, or the default when the
	/// three-argument constructor was used.
	/// <para/>
	/// since 1.7
	/// </summary>
	public virtual int MaxPhonemes => maxPhonemes;
	}
	}