// Source blob: fedc8ded7ee19e0f83d2ff98f813d52b12698b63
// commons-codec version compatibility level: 1.9
using J2N.Collections.Generic.Extensions;
using J2N.Text;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Analysis.Phonetic.Language.Bm
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Converts words into potential phonetic representations.
/// </summary>
/// <remarks>
/// This is a two-stage process. Firstly, the word is converted into a phonetic representation that takes
/// into account the likely source language. Next, this phonetic representation is converted into a
/// pan-European 'average' representation, allowing comparison between different versions of essentially
/// the same word from different languages.
/// <para/>
/// This class is intentionally immutable and thread-safe.
/// If you wish to alter the settings for a PhoneticEngine, you
/// must make a new one with the updated settings.
/// <para/>
/// Ported from phoneticengine.php
/// <para/>
/// since 1.6
/// </remarks>
public class PhoneticEngine
{
internal Regex WHITESPACE = new Regex("\\s+", RegexOptions.Compiled);
/// <summary>
/// Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside
/// this package, and probably not outside the <see cref="PhoneticEngine"/> class.
/// <para/>
/// The builder is mutable: <c>Append</c> and <see cref="Apply(IPhonemeExpr, int)"/> modify the
/// underlying phoneme set in place rather than producing a new builder.
/// <para/>
/// since 1.6
/// </summary>
internal sealed class PhonemeBuilder
{
    /// <summary>
    /// An empty builder where all phonemes must come from some set of languages. This will contain a single
    /// phoneme of zero characters. This can then be appended to. This should be the only way to create a new
    /// phoneme from scratch.
    /// </summary>
    /// <param name="languages">The set of languages.</param>
    /// <returns>A new, empty phoneme builder.</returns>
    public static PhonemeBuilder Empty(LanguageSet languages)
    {
        return new PhonemeBuilder(new Phoneme("", languages));
    }

    // The phonemes currently held by this builder; mutated in place by Append/Apply.
    private readonly ISet<Phoneme> phonemes;

    /// <summary>
    /// Creates a builder whose set initially contains only <paramref name="phoneme"/>.
    /// </summary>
    private PhonemeBuilder(Phoneme phoneme)
    {
        this.phonemes = new JCG.LinkedHashSet<Phoneme>
        {
            phoneme
        };
    }

    /// <summary>
    /// Creates a builder that wraps the supplied set directly (no copy is made), so later
    /// mutations through this builder are visible in <paramref name="phonemes"/>.
    /// </summary>
    internal PhonemeBuilder(ISet<Phoneme> phonemes)
    {
        this.phonemes = phonemes;
    }

    /// <summary>
    /// Extends every phoneme held by this builder, in place, with the text of <paramref name="str"/>.
    /// </summary>
    /// <param name="str">The characters to append to the phonemes.</param>
    public void Append(ICharSequence str)
    {
        if (this.phonemes.Count == 0)
        {
            // Nothing to extend; str is deliberately not touched, matching the loop-based behavior.
            return;
        }
        // Convert once instead of once per phoneme.
        Append(str.ToString());
    }

    /// <summary>
    /// Extends every phoneme held by this builder, in place, with <paramref name="str"/>.
    /// </summary>
    /// <param name="str">The characters to append to the phonemes.</param>
    // LUCENENET specific
    public void Append(string str)
    {
        foreach (Phoneme ph in this.phonemes)
        {
            ph.Append(str);
        }
    }

    /// <summary>
    /// Extends every phoneme held by this builder, in place, with the current content of <paramref name="str"/>.
    /// </summary>
    /// <param name="str">The characters to append to the phonemes.</param>
    // LUCENENET specific
    public void Append(StringBuilder str)
    {
        if (this.phonemes.Count == 0)
        {
            // Nothing to extend; str is deliberately not touched, matching the loop-based behavior.
            return;
        }
        // Convert once instead of once per phoneme.
        Append(str.ToString());
    }

    /// <summary>
    /// Applies the given phoneme expression to all phonemes in this phoneme builder.
    /// <para/>
    /// This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
    /// incompatible. The builder's set is replaced, in place, by the joined phonemes; collection stops as soon
    /// as <paramref name="maxPhonemes"/> results have been gathered.
    /// </summary>
    /// <param name="phonemeExpr">The expression to apply.</param>
    /// <param name="maxPhonemes">The maximum number of phonemes to build up.</param>
    public void Apply(IPhonemeExpr phonemeExpr, int maxPhonemes)
    {
        ISet<Phoneme> newPhonemes = new JCG.LinkedHashSet<Phoneme>(maxPhonemes);
        CollectJoined(this.phonemes, phonemeExpr, maxPhonemes, newPhonemes);

        this.phonemes.Clear();
        this.phonemes.UnionWith(newPhonemes);
    }

    /// <summary>
    /// Joins each phoneme in <paramref name="current"/> with each phoneme of <paramref name="phonemeExpr"/>
    /// whose languages overlap, adding the results to <paramref name="target"/> until it holds
    /// <paramref name="maxPhonemes"/> entries.
    /// </summary>
    private static void CollectJoined(ISet<Phoneme> current, IPhonemeExpr phonemeExpr, int maxPhonemes,
        ISet<Phoneme> target)
    {
        foreach (Phoneme left in current)
        {
            foreach (Phoneme right in phonemeExpr.Phonemes)
            {
                LanguageSet languages = left.Languages.RestrictTo(right.Languages);
                if (languages.IsEmpty)
                {
                    // No language in common: this combination is dropped.
                    continue;
                }

                Phoneme join = new Phoneme(left, right, languages);
                if (target.Count < maxPhonemes)
                {
                    target.Add(join);
                    if (target.Count >= maxPhonemes)
                    {
                        // Limit reached: stop scanning both loops.
                        return;
                    }
                }
            }
        }
    }

    /// <summary>
    /// Gets underlying phoneme set. Please don't mutate.
    /// </summary>
    public ISet<Phoneme> Phonemes => phonemes;

    /// <summary>
    /// Stringifies the phoneme set. This produces a single string of the strings of each phoneme,
    /// joined with a pipe. This is explicitly provided in place of <see cref="object.ToString()"/> as it is a potentially
    /// expensive operation, which should be avoided when debugging.
    /// </summary>
    /// <returns>The stringified phoneme set.</returns>
    public string MakeString()
    {
        StringBuilder sb = new StringBuilder();
        foreach (Phoneme ph in this.phonemes)
        {
            // The separator is only emitted once some text has been written, so leading
            // zero-length phoneme texts do not produce a leading pipe.
            if (sb.Length > 0)
            {
                sb.Append("|");
            }
            sb.Append(ph.GetPhonemeText());
        }
        return sb.ToString();
    }
}
/// <summary>
/// Captures one attempt to apply a rule map to an input string at a given offset.
/// After <see cref="Invoke"/> returns, <see cref="I"/> is the index of the next character of the input
/// still to be processed and <see cref="IsFound"/> tells whether any rule matched. When a rule matched,
/// the phoneme builder supplied at construction has had that rule's phonemes applied to it in place.
/// <para/>
/// Although this class is not thread-safe (it has mutable unprotected fields), it is not shared between threads
/// as it is constructed as needed by the calling methods.
/// <para/>
/// since 1.6
/// </summary>
private sealed class RulesApplication
{
    private readonly IDictionary<string, IList<Rule>> finalRules;
    private readonly string input;
    private readonly PhonemeBuilder phonemeBuilder;
    private int i;
    private readonly int maxPhonemes;
    private bool found;

    /// <summary>
    /// Creates a rule application positioned at offset <paramref name="i"/> of <paramref name="input"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="finalRules"/> is <c>null</c>.</exception>
    public RulesApplication(IDictionary<string, IList<Rule>> finalRules, string input,
        PhonemeBuilder phonemeBuilder, int i, int maxPhonemes)
    {
        if (finalRules == null)
        {
            throw new ArgumentNullException(nameof(finalRules), "The finalRules argument must not be null");
        }

        this.finalRules = finalRules;
        this.phonemeBuilder = phonemeBuilder;
        this.input = input;
        this.i = i;
        this.maxPhonemes = maxPhonemes;
    }

    /// <summary>Index of the next input character to process.</summary>
    public int I => this.i;

    /// <summary>The builder handed to the constructor (the same instance).</summary>
    public PhonemeBuilder PhonemeBuilder => this.phonemeBuilder;

    /// <summary>
    /// Looks up the rules keyed by the single character at the current offset and walks them in order,
    /// stopping at the first whose pattern and context match. That rule's phonemes are applied to the
    /// builder and the offset advances by the rule's pattern length. When nothing matches, the offset
    /// advances by one and nothing is added to the builder here.
    /// </summary>
    /// <returns><c>this</c></returns>
    public RulesApplication Invoke()
    {
        this.found = false;

        // Default step when no rule matches: a single character.
        int advance = 1;
        string key = this.input.Substring(this.i, 1);

        if (this.finalRules.TryGetValue(key, out IList<Rule> candidates) && candidates != null)
        {
            foreach (Rule candidate in candidates)
            {
                int candidateLength = candidate.Pattern.Length;
                if (!candidate.PatternAndContextMatches(this.input, this.i))
                {
                    continue;
                }

                this.phonemeBuilder.Apply(candidate.Phoneme, this.maxPhonemes);
                this.found = true;
                advance = candidateLength;
                break;
            }
        }

        this.i += advance;
        return this;
    }

    /// <summary>Whether the most recent <see cref="Invoke"/> found a matching rule.</summary>
    public bool IsFound => this.found;
}
// Name prefixes recognised for each name type, built once when the type is initialised.
private static readonly IDictionary<NameType, ISet<string>> NAME_PREFIXES = LoadNamePrefixes();

/// <summary>
/// Builds the per-<see cref="Bm.NameType"/> table of name prefixes. Each set is wrapped read-only.
/// The element order of each set is kept exactly as listed here.
/// </summary>
private static IDictionary<NameType, ISet<string>> LoadNamePrefixes() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
{
    ISet<string> ashkenaziPrefixes =
        new JCG.HashSet<string>() { "bar", "ben", "da", "de", "van", "von" }.AsReadOnly();

    ISet<string> sephardicPrefixes =
        new JCG.HashSet<string>() { "al", "el", "da", "dal", "de", "del", "dela", "de la",
            "della", "des", "di", "do", "dos", "du", "van", "von" }.AsReadOnly();

    ISet<string> genericPrefixes =
        new JCG.HashSet<string>() { "da", "dal", "de", "del", "dela", "de la", "della",
            "des", "di", "do", "dos", "du", "van", "von" }.AsReadOnly();

    var prefixesByType = new Dictionary<NameType, ISet<string>>();
    prefixesByType[NameType.ASHKENAZI] = ashkenaziPrefixes;
    prefixesByType[NameType.SEPHARDIC] = sephardicPrefixes;
    prefixesByType[NameType.GENERIC] = genericPrefixes;
    return prefixesByType;
}
/// <summary>
/// Joins some strings with an internal separator.
/// </summary>
/// <param name="strings">Strings to join.</param>
/// <param name="sep">String to separate them with.</param>
/// <returns>A single string consisting of each element of <paramref name="strings"/> interleaved by <paramref name="sep"/>;
/// the empty string when <paramref name="strings"/> has no elements.</returns>
private static string Join(IEnumerable<string> strings, string sep)
{
    // string.Join implements exactly this interleaving; no hand-rolled enumerator loop is needed.
    return string.Join(sep, strings);
}
// Phoneme limit used by the constructor overload that does not take an explicit maxPhonemes.
private const int DEFAULT_MAX_PHONEMES = 20;
// Language guesser obtained from Lang.GetInstance(nameType) in the constructor.
private readonly Lang lang;
// Name type given at construction; selects the rule maps and the name-prefix handling in Encode.
private readonly NameType nameType;
// Rule type given at construction; used to look up the final rule maps in Encode.
private readonly RuleType ruleType;
// When true, Encode joins the words of a multi-word input with spaces and encodes them as one string.
private readonly bool concat;
// Upper bound on the number of phonemes, passed to every RulesApplication this engine creates.
private readonly int maxPhonemes;
/// <summary>
/// Creates a phonetic engine that uses the default phoneme limit (<c>DEFAULT_MAX_PHONEMES</c>).
/// </summary>
/// <param name="nameType">The type of names it will use.</param>
/// <param name="ruleType">The type of rules it will apply.</param>
/// <param name="concat">If it will concatenate multiple encodings.</param>
public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat)
    : this(nameType: nameType, ruleType: ruleType, concat: concat, maxPhonemes: DEFAULT_MAX_PHONEMES)
{
    // All initialisation happens in the four-argument constructor.
}
/// <summary>
/// Creates a phonetic engine with an explicit limit on the number of phonemes.
/// <para/>
/// since 1.7
/// </summary>
/// <param name="nameType">The type of names it will use.</param>
/// <param name="ruleType">The type of rules it will apply.</param>
/// <param name="concat">If it will concatenate multiple encodings.</param>
/// <param name="maxPhonemes">The maximum number of phonemes that will be handled.</param>
/// <exception cref="ArgumentException"><paramref name="ruleType"/> is <c>RuleType.RULES</c>.</exception>
public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat,
    int maxPhonemes)
{
    // RuleType.RULES is the type Encode itself uses for its first pass, so it is rejected here.
    bool isFirstPassRuleType = ruleType == RuleType.RULES;
    if (isFirstPassRuleType)
    {
        throw new ArgumentException("ruleType must not be " + RuleType.RULES);
    }

    this.lang = Lang.GetInstance(nameType);
    this.nameType = nameType;
    this.ruleType = ruleType;
    this.concat = concat;
    this.maxPhonemes = maxPhonemes;
}
/// <summary>
/// Applies the final rules to convert from a language-specific phonetic representation to a
/// language-independent representation.
/// </summary>
/// <param name="phonemeBuilder">The current phonemes.</param>
/// <param name="finalRules">The final rules to apply.</param>
/// <returns>
/// <paramref name="phonemeBuilder"/> itself when <paramref name="finalRules"/> is empty; otherwise a new
/// builder over a set ordered by <c>Phoneme.COMPARER</c> that holds the rewritten phonemes.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="finalRules"/> is <c>null</c>.</exception>
private PhonemeBuilder ApplyFinalRules(PhonemeBuilder phonemeBuilder,
    IDictionary<string, IList<Rule>> finalRules)
{
    if (finalRules == null)
    {
        // The single-string constructor would treat the message as the parameter name.
        throw new ArgumentNullException(nameof(finalRules), "finalRules can not be null");
    }
    if (finalRules.Count == 0)
    {
        return phonemeBuilder;
    }

    ISet<Phoneme> phonemes = new JCG.SortedSet<Phoneme>(Phoneme.COMPARER);

    foreach (Phoneme phoneme in phonemeBuilder.Phonemes)
    {
        // Each phoneme is re-encoded from scratch, restricted to its own languages.
        PhonemeBuilder subBuilder = PhonemeBuilder.Empty(phoneme.Languages);
        string phonemeText = phoneme.GetPhonemeText();

        // The index is advanced manually from the rule application's result.
        for (int i = 0; i < phonemeText.Length;)
        {
            RulesApplication rulesApplication =
                new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).Invoke();
            bool found = rulesApplication.IsFound;
            subBuilder = rulesApplication.PhonemeBuilder;

            if (!found)
            {
                // not found, appending as-is
                subBuilder.Append(phonemeText.Substring(i, 1));
            }

            i = rulesApplication.I;
        }

        phonemes.UnionWith(subBuilder.Phonemes);
    }

    return new PhonemeBuilder(phonemes);
}
/// <summary>
/// Encodes a string to its phonetic representation, first guessing the candidate languages of
/// <paramref name="input"/> with this engine's <see cref="Lang"/> and then delegating to
/// <see cref="Encode(string, LanguageSet)"/>.
/// </summary>
/// <param name="input">The string to encode.</param>
/// <returns>The encoding of the input.</returns>
public virtual string Encode(string input)
    => Encode(input, this.lang.GuessLanguages(input));
/// <summary>
/// Encodes an input string into an output phonetic representation, given a set of possible origin languages.
/// </summary>
/// <remarks>
/// The input is lower-cased (invariant culture), dashes are replaced by spaces and the result is trimmed.
/// For <c>NameType.GENERIC</c>, a leading <c>d'</c> or a leading known prefix followed by a space yields
/// <c>(remainder)-(prefix+remainder)</c>, each part encoded via <see cref="Encode(string)"/>.
/// Otherwise the input is split on whitespace and prefix handling depends on the name type. In concat
/// mode the remaining words are joined with spaces and encoded as one string; in non-concat mode a
/// multi-word name is encoded word by word and the results are joined with <c>-</c>.
/// </remarks>
/// <param name="input">String to phoneticise; a string with dashes or spaces separating each word.</param>
/// <param name="languageSet">The languages the input may originate from; selects the rule maps and seeds the phoneme builder.</param>
/// <returns>
/// A phonetic representation of the input; a string containing '-'-separated phonetic representations of the input.
/// In non-concat mode, the empty string is returned when no words remain after prefix removal.
/// </returns>
public virtual string Encode(string input, LanguageSet languageSet)
{
    IDictionary<string, IList<Rule>> rules = Rule.GetInstanceMap(this.nameType, RuleType.RULES, languageSet);
    // rules common across many (all) languages
    IDictionary<string, IList<Rule>> finalRules1 = Rule.GetInstanceMap(this.nameType, this.ruleType, "common");
    // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
    IDictionary<string, IList<Rule>> finalRules2 = Rule.GetInstanceMap(this.nameType, this.ruleType, languageSet);

    // tidy the input
    // lower-casing uses the invariant culture so the result does not vary with the current culture
    input = input.ToLowerInvariant().Replace('-', ' ').Trim();

    if (this.nameType == NameType.GENERIC)
    {
        if (input.StartsWith("d'", StringComparison.Ordinal))
        { // check for d'
            string remainder = input.Substring(2);
            string combined = "d" + remainder;
            return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
        }
        foreach (string l in NAME_PREFIXES[this.nameType])
        {
            // handle generic prefixes
            if (input.StartsWith(l + " ", StringComparison.Ordinal))
            {
                // check for any prefix in the words list
                string remainder = input.Substring(l.Length + 1); // input without the prefix
                string combined = l + remainder; // input with prefix without space
                return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
            }
        }
    }

    IList<string> words = WHITESPACE.Split(input).TrimEnd();
    IList<string> words2 = new List<string>();

    // special-case handling of word prefixes based upon the name type
    switch (this.nameType)
    {
        case NameType.SEPHARDIC:
            foreach (string aWord in words)
            {
                // keep only the text after the last apostrophe
                string[] parts = aWord.Split('\'').TrimEnd();
                string lastPart = parts[parts.Length - 1];
                words2.Add(lastPart);
            }
            words2.RemoveAll(NAME_PREFIXES[this.nameType]);
            break;
        case NameType.ASHKENAZI:
            words2.AddRange(words);
            words2.RemoveAll(NAME_PREFIXES[this.nameType]);
            break;
        case NameType.GENERIC:
            words2.AddRange(words);
            break;
        default:
            throw new InvalidOperationException("Unreachable case: " + this.nameType);
    }

    if (this.concat)
    {
        // concat mode enabled
        input = Join(words2, " ");
    }
    else if (words2.Count == 1)
    {
        // not a multi-word name
        // NOTE(review): this takes the first entry of words, not words2, mirroring the upstream
        // implementation (input = words.iterator().next()); left as-is to keep encodings unchanged.
        input = words[0];
    }
    else if (words2.Count == 0)
    {
        // Nothing left to encode (for example, every word was removed as a name prefix).
        // Taking ToString(1, Length - 1) of an empty builder would throw ArgumentOutOfRangeException;
        // concat mode yields an empty encoding for the same input, so do the same here.
        return string.Empty;
    }
    else
    {
        // encode each word in a multi-word name separately (normally used for approx matches)
        StringBuilder result = new StringBuilder();
        foreach (string word in words2)
        {
            result.Append("-").Append(Encode(word));
        }
        // return the result without the leading "-"
        return result.ToString(1, result.Length - 1);
    }

    PhonemeBuilder phonemeBuilder = PhonemeBuilder.Empty(languageSet);

    // loop over each char in the input - we will handle the increment manually
    for (int i = 0; i < input.Length;)
    {
        RulesApplication rulesApplication =
            new RulesApplication(rules, input, phonemeBuilder, i, maxPhonemes).Invoke();
        i = rulesApplication.I;
        phonemeBuilder = rulesApplication.PhonemeBuilder;
    }

    // Apply the general rules
    phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules1);
    // Apply the language-specific rules
    phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules2);

    return phonemeBuilder.MakeString();
}
/// <summary>
/// Gets the <see cref="Bm.Lang"/> language-guessing rules this engine was constructed with.
/// </summary>
public virtual Lang Lang => this.lang;

/// <summary>
/// Gets the <see cref="Bm.NameType"/> this engine was constructed with.
/// </summary>
public virtual NameType NameType => this.nameType;

/// <summary>
/// Gets the <see cref="Bm.RuleType"/> this engine was constructed with.
/// </summary>
public virtual RuleType RuleType => this.ruleType;

/// <summary>
/// Gets if multiple phonetic encodings are concatenated or if just the first one is kept.
/// Returns <c>true</c> if multiple phonetic encodings are returned, <c>false</c> if just the first is.
/// </summary>
public virtual bool IsConcat => this.concat;

/// <summary>
/// Gets the maximum number of phonemes the engine will calculate for a given input.
/// <para/>
/// since 1.7
/// </summary>
public virtual int MaxPhonemes => this.maxPhonemes;
}
}