blob: c10b9926f5e6369e763c68f4127f4403b26228af [file] [log] [blame]
// commons-codec version compatibility level: 1.9
using System;
using System.Globalization;
using System.Text;
namespace Lucene.Net.Analysis.Phonetic.Language
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Encodes a string into a Metaphone value.
/// <para/>
/// Initial Java implementation by <c>William B. Brogden. December, 1997</c>.
/// Permission given by <c>wbrogden</c> for code to be used anywhere.
/// <para/>
/// <c>Hanging on the Metaphone</c> by <c>Lawrence Philips</c> in <c>Computer Language of Dec. 1990,
/// p 39.</c>
/// <para/>
/// Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
/// <para/>
/// <list type="bullet">
/// <item><description><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> (broken link 4/30/2013) </description></item>
/// <item><description><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> (link checked 4/30/2013) </description></item>
/// </list>
/// <para/>
/// They have had undocumented changes from the originally published algorithm.
/// For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
/// <para/>
/// This class is conditionally thread-safe.
/// The instance field <see cref="maxCodeLen"/> is mutable <see cref="MaxCodeLen"/>
/// but is not volatile, and accesses are not synchronized.
/// If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
/// is used to ensure safe publication of the value between threads, and must not set <see cref="MaxCodeLen"/>
/// after initial setup.
/// </summary>
public class Metaphone : IStringEncoder
{
/// <summary>
/// Five values in the English language
/// </summary>
private const string VOWELS = "AEIOU";
/// <summary>
/// Variable used in Metaphone algorithm
/// </summary>
private const string FRONTV = "EIY";
/// <summary>
/// Variable used in Metaphone algorithm
/// </summary>
private const string VARSON = "CSPTG";
/// <summary>
/// The max code length for metaphone is 4
/// </summary>
private int maxCodeLen = 4;
/// <summary>
/// Creates an instance of the <see cref="Metaphone"/> encoder
/// </summary>
public Metaphone()
: base()
{
}
private static readonly CultureInfo LOCALE_ENGLISH = new CultureInfo("en");
/// <summary>
/// Find the metaphone value of a string. This is similar to the
/// soundex algorithm, but better at finding similar sounding words.
/// All input is converted to upper case.
/// Limitations: Input format is expected to be a single ASCII word
/// with only characters in the A - Z range, no punctuation or numbers.
/// </summary>
/// <param name="txt">String to find the metaphone code for.</param>
/// <returns>A metaphone code corresponding to the string supplied.</returns>
public virtual string GetMetaphone(string txt)
{
bool hard; // LUCENENET: IDE0059: Remove unnecessary value assignment
if (txt == null || txt.Length == 0)
{
return "";
}
// single character is itself
if (txt.Length == 1)
{
return LOCALE_ENGLISH.TextInfo.ToUpper(txt);
}
char[] inwd = LOCALE_ENGLISH.TextInfo.ToUpper(txt).ToCharArray();
StringBuilder local = new StringBuilder(40); // manipulate
StringBuilder code = new StringBuilder(10); // output
// handle initial 2 characters exceptions
switch (inwd[0])
{
case 'K':
case 'G':
case 'P': /* looking for KN, etc*/
if (inwd[1] == 'N')
{
local.Append(inwd, 1, inwd.Length - 1);
}
else
{
local.Append(inwd);
}
break;
case 'A': /* looking for AE */
if (inwd[1] == 'E')
{
local.Append(inwd, 1, inwd.Length - 1);
}
else
{
local.Append(inwd);
}
break;
case 'W': /* looking for WR or WH */
if (inwd[1] == 'R')
{ // WR -> R
local.Append(inwd, 1, inwd.Length - 1);
break;
}
if (inwd[1] == 'H')
{
local.Append(inwd, 1, inwd.Length - 1);
local[0] = 'W'; // WH -> W
}
else
{
local.Append(inwd);
}
break;
case 'X': /* initial X becomes S */
inwd[0] = 'S';
local.Append(inwd);
break;
default:
local.Append(inwd);
break;
} // now local has working string with initials fixed
int wdsz = local.Length;
int n = 0;
while (code.Length < this.MaxCodeLen &&
n < wdsz)
{ // max code size of 4 works well
char symb = local[n];
// remove duplicate letters except C
if (symb != 'C' && IsPreviousChar(local, n, symb))
{
n++;
}
else
{ // not dup
switch (symb)
{
case 'A':
case 'E':
case 'I':
case 'O':
case 'U':
if (n == 0)
{
code.Append(symb);
}
break; // only use vowel if leading char
case 'B':
if (IsPreviousChar(local, n, 'M') &&
IsLastChar(wdsz, n))
{ // B is silent if word ends in MB
break;
}
code.Append(symb);
break;
case 'C': // lots of C special cases
/* discard if SCI, SCE or SCY */
if (IsPreviousChar(local, n, 'S') &&
!IsLastChar(wdsz, n) &&
FRONTV.IndexOf(local[n + 1]) >= 0)
{
break;
}
if (RegionMatch(local, n, "CIA"))
{ // "CIA" -> X
code.Append('X');
break;
}
if (!IsLastChar(wdsz, n) &&
FRONTV.IndexOf(local[n + 1]) >= 0)
{
code.Append('S');
break; // CI,CE,CY -> S
}
if (IsPreviousChar(local, n, 'S') &&
IsNextChar(local, n, 'H'))
{ // SCH->sk
code.Append('K');
break;
}
if (IsNextChar(local, n, 'H'))
{ // detect CH
if (n == 0 &&
wdsz >= 3 &&
IsVowel(local, 2))
{ // CH consonant -> K consonant
code.Append('K');
}
else
{
code.Append('X'); // CHvowel -> X
}
}
else
{
code.Append('K');
}
break;
case 'D':
if (!IsLastChar(wdsz, n + 1) &&
IsNextChar(local, n, 'G') &&
FRONTV.IndexOf(local[n + 2]) >= 0)
{ // DGE DGI DGY -> J
code.Append('J'); n += 2;
}
else
{
code.Append('T');
}
break;
case 'G': // GH silent at end or before consonant
if (IsLastChar(wdsz, n + 1) &&
IsNextChar(local, n, 'H'))
{
break;
}
if (!IsLastChar(wdsz, n + 1) &&
IsNextChar(local, n, 'H') &&
!IsVowel(local, n + 2))
{
break;
}
if (n > 0 &&
(RegionMatch(local, n, "GN") ||
RegionMatch(local, n, "GNED")))
{
break; // silent G
}
if (IsPreviousChar(local, n, 'G'))
{
// NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
hard = true;
}
else
{
hard = false;
}
if (!IsLastChar(wdsz, n) &&
FRONTV.IndexOf(local[n + 1]) >= 0 &&
!hard)
{
code.Append('J');
}
else
{
code.Append('K');
}
break;
case 'H':
if (IsLastChar(wdsz, n))
{
break; // terminal H
}
if (n > 0 &&
VARSON.IndexOf(local[n - 1]) >= 0)
{
break;
}
if (IsVowel(local, n + 1))
{
code.Append('H'); // Hvowel
}
break;
case 'F':
case 'J':
case 'L':
case 'M':
case 'N':
case 'R':
code.Append(symb);
break;
case 'K':
if (n > 0)
{ // not initial
if (!IsPreviousChar(local, n, 'C'))
{
code.Append(symb);
}
}
else
{
code.Append(symb); // initial K
}
break;
case 'P':
if (IsNextChar(local, n, 'H'))
{
// PH -> F
code.Append('F');
}
else
{
code.Append(symb);
}
break;
case 'Q':
code.Append('K');
break;
case 'S':
if (RegionMatch(local, n, "SH") ||
RegionMatch(local, n, "SIO") ||
RegionMatch(local, n, "SIA"))
{
code.Append('X');
}
else
{
code.Append('S');
}
break;
case 'T':
if (RegionMatch(local, n, "TIA") ||
RegionMatch(local, n, "TIO"))
{
code.Append('X');
break;
}
if (RegionMatch(local, n, "TCH"))
{
// Silent if in "TCH"
break;
}
// substitute numeral 0 for TH (resembles theta after all)
if (RegionMatch(local, n, "TH"))
{
code.Append('0');
}
else
{
code.Append('T');
}
break;
case 'V':
code.Append('F'); break;
case 'W':
case 'Y': // silent if not followed by vowel
if (!IsLastChar(wdsz, n) &&
IsVowel(local, n + 1))
{
code.Append(symb);
}
break;
case 'X':
code.Append('K');
code.Append('S');
break;
case 'Z':
code.Append('S');
break;
default:
// do nothing
break;
} // end switch
n++;
} // end else from symb != 'C'
if (code.Length > this.MaxCodeLen)
{
code.Length = this.MaxCodeLen;
}
}
return code.ToString();
}
private bool IsVowel(StringBuilder sb, int index)
{
return VOWELS.IndexOf(sb[index]) >= 0;
}
private bool IsPreviousChar(StringBuilder sb, int index, char c)
{
bool matches = false;
if (index > 0 &&
index < sb.Length)
{
matches = sb[index - 1] == c;
}
return matches;
}
private bool IsNextChar(StringBuilder sb, int index, char c)
{
bool matches = false;
if (index >= 0 &&
index < sb.Length - 1)
{
matches = sb[index + 1] == c;
}
return matches;
}
private bool RegionMatch(StringBuilder sb, int index, string test)
{
bool matches = false;
if (index >= 0 &&
index + test.Length - 1 < sb.Length)
{
string substring = sb.ToString(index, test.Length);
matches = substring.Equals(test, StringComparison.Ordinal);
}
return matches;
}
private bool IsLastChar(int wdsz, int n)
{
return n + 1 == wdsz;
}
// LUCENENET specific - in .NET we don't need an object overload of Encode(), since strings are sealed anyway.
/// <summary>
/// Encodes a string using the <see cref="Metaphone"/> algorithm.
/// </summary>
/// <param name="str">String to encode.</param>
/// <returns>The metaphone code corresponding to the string supplied.</returns>
public virtual string Encode(string str)
{
return GetMetaphone(str);
}
/// <summary>
/// Tests is the metaphones of two strings are identical.
/// </summary>
/// <param name="str1">First of two strings to compare.</param>
/// <param name="str2">Second of two strings to compare.</param>
/// <returns><c>true</c> if the metaphones of these strings are identical, <c>false</c> otherwise.</returns>
public virtual bool IsMetaphoneEqual(string str1, string str2)
{
return GetMetaphone(str1).Equals(GetMetaphone(str2), StringComparison.Ordinal);
}
/// <summary>
/// Gets or Sets <see cref="maxCodeLen"/>.
/// </summary>
public virtual int MaxCodeLen
{
get => this.maxCodeLen;
set => this.maxCodeLen = value;
}
}
}