blob: a09a70a1b8a118c08080faeb8162d19bdd5fa45d [file] [log] [blame]
// commons-codec version compatibility level: 1.9
using System;
using System.Text;
using System.Text.RegularExpressions;
namespace Lucene.Net.Analysis.Phonetic.Language
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
/// general purpose scheme to find word with similar phonemes.
/// </summary>
/// <remarks>
/// NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
/// <para/>
/// Algorithm description:
/// <list type="number">
/// <item>
/// <term>Transcode first characters of name</term>
/// <description>
/// <list type="number">
/// <item><description>MAC -> MCC</description></item>
/// <item><description>KN -> NN</description></item>
/// <item><description>K -> C</description></item>
/// <item><description>PH -> FF</description></item>
/// <item><description>PF -> FF</description></item>
/// <item><description>SCH -> SSS</description></item>
/// </list>
/// </description>
/// </item>
/// <item>
/// <term>Transcode last characters of name</term>
/// <description>
/// <list type="number">
/// <item><description>EE, IE -> Y</description></item>
/// <item><description>DT,RT,RD,NT,ND -> D</description></item>
/// </list>
/// </description>
/// </item>
/// <item>
/// <term>First character of key = first character of name</term>
/// </item>
/// <item>
/// <term>Transcode remaining characters by following these rules, incrementing by one character each time</term>
/// <description>
/// <list type="number">
/// <item><description>EV -> AF else A,E,I,O,U -> A</description></item>
/// <item><description>Q -> G</description></item>
/// <item><description>Z -> S</description></item>
/// <item><description>M -> N</description></item>
/// <item><description>KN -> N else K -> C</description></item>
/// <item><description>SCH -> SSS</description></item>
/// <item><description>PH -> FF</description></item>
/// <item><description>H -> If previous or next is nonvowel, previous</description></item>
/// <item><description>W -> If previous is vowel, previous</description></item>
/// <item><description>Add current to key if current != last key character</description></item>
/// </list>
/// </description>
/// </item>
/// <item>
/// <term>If last character is S, remove it</term>
/// </item>
/// <item>
/// <term>If last characters are AY, replace with Y</term>
/// </item>
/// <item>
/// <term>If last character is A, remove it</term>
/// </item>
/// <item>
/// <term>Collapse all strings of repeated characters</term>
/// </item>
/// <item>
/// <term>Add original first character of name as first character of key</term>
/// </item>
/// </list>
/// <para/>
/// This class is immutable and thread-safe.
/// <para/>
/// See: <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
/// <para/>
/// See: <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
/// <para/>
/// since 1.7
/// </remarks>
/// <seealso cref="Soundex"/>
public class Nysiis : IStringEncoder
{
private static readonly char[] CHARS_A = new char[] { 'A' };
private static readonly char[] CHARS_AF = new char[] { 'A', 'F' };
private static readonly char[] CHARS_C = new char[] { 'C' };
private static readonly char[] CHARS_FF = new char[] { 'F', 'F' };
private static readonly char[] CHARS_G = new char[] { 'G' };
private static readonly char[] CHARS_N = new char[] { 'N' };
private static readonly char[] CHARS_NN = new char[] { 'N', 'N' };
private static readonly char[] CHARS_S = new char[] { 'S' };
private static readonly char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };
private static readonly Regex PAT_MAC = new Regex("^MAC", RegexOptions.Compiled);
private static readonly Regex PAT_KN = new Regex("^KN", RegexOptions.Compiled);
private static readonly Regex PAT_K = new Regex("^K", RegexOptions.Compiled);
private static readonly Regex PAT_PH_PF = new Regex("^(PH|PF)", RegexOptions.Compiled);
private static readonly Regex PAT_SCH = new Regex("^SCH", RegexOptions.Compiled);
private static readonly Regex PAT_EE_IE = new Regex("(EE|IE)$", RegexOptions.Compiled);
private static readonly Regex PAT_DT_ETC = new Regex("(DT|RT|RD|NT|ND)$", RegexOptions.Compiled);
private const char SPACE = ' ';
private const int TRUE_LENGTH = 6;
/// <summary>
/// Tests if the given character is a vowel.
/// </summary>
/// <param name="c">The character to test.</param>
/// <returns><c>true</c> if the character is a vowel, <c>false</c> otherwise.</returns>
private static bool IsVowel(char c)
{
return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
}
/// <summary>
/// Transcodes the remaining parts of the string. The method operates on a sliding window, looking at 4 characters at
/// a time: [i-1, i, i+1, i+2].
/// </summary>
/// <param name="prev">The previous character.</param>
/// <param name="curr">The current character.</param>
/// <param name="next">The next character.</param>
/// <param name="aNext">The after next character</param>
/// <returns>A transcoded array of characters, starting from the current position.</returns>
private static char[] TranscodeRemaining(char prev, char curr, char next, char aNext)
{
// 1. EV -> AF
if (curr == 'E' && next == 'V')
{
return CHARS_AF;
}
// A, E, I, O, U -> A
if (IsVowel(curr))
{
return CHARS_A;
}
// 2. Q -> G, Z -> S, M -> N
if (curr == 'Q')
{
return CHARS_G;
}
else if (curr == 'Z')
{
return CHARS_S;
}
else if (curr == 'M')
{
return CHARS_N;
}
// 3. KN -> NN else K -> C
if (curr == 'K')
{
if (next == 'N')
{
return CHARS_NN;
}
else
{
return CHARS_C;
}
}
// 4. SCH -> SSS
if (curr == 'S' && next == 'C' && aNext == 'H')
{
return CHARS_SSS;
}
// PH -> FF
if (curr == 'P' && next == 'H')
{
return CHARS_FF;
}
// 5. H -> If previous or next is a non vowel, previous.
if (curr == 'H' && (!IsVowel(prev) || !IsVowel(next)))
{
return new char[] { prev };
}
// 6. W -> If previous is vowel, previous.
if (curr == 'W' && IsVowel(prev))
{
return new char[] { prev };
}
return new char[] { curr };
}
/// <summary>Indicates the strict mode.</summary>
private readonly bool strict;
/// <summary>
/// Creates an instance of the <see cref="Nysiis"/> encoder with strict mode (original form),
/// i.e. encoded strings have a maximum length of 6.
/// </summary>
public Nysiis()
: this(true)
{
}
/// <summary>
/// Create an instance of the <see cref="Nysiis"/> encoder with the specified strict mode:
/// <list type="bullet">
/// <item><term><c>true</c>:</term><description>encoded strings have a maximum length of 6</description></item>
/// <item><term><c>false</c>:</term><description>encoded strings may have arbitrary length</description></item>
/// </list>
/// </summary>
/// <param name="strict">The strict mode.</param>
public Nysiis(bool strict)
{
this.strict = strict;
}
// LUCENENET specific - in .NET we don't need an object overload of Encode(), since strings are sealed anyway.
/// <summary>
/// Encodes a string using the NYSIIS algorithm.
/// </summary>
/// <param name="str">A string object to encode.</param>
/// <returns>A <see cref="Nysiis"/> code corresponding to the string supplied.</returns>
/// <exception cref="ArgumentException">If a character is not mapped.</exception>
public virtual string Encode(string str)
{
return this.GetNysiis(str);
}
/// <summary>
/// Indicates the strict mode for this <see cref="Nysiis"/> encoder.
/// <c>true</c> if the encoder is configured for strict mode, <c>false</c> otherwise.
/// </summary>
public virtual bool IsStrict => this.strict;
/// <summary>
/// Retrieves the NYSIIS code for a given string.
/// </summary>
/// <param name="str">String to encode using the NYSIIS algorithm.</param>
/// <returns>A NYSIIS code for the string supplied.</returns>
public virtual string GetNysiis(string str)
{
if (str == null)
{
return null;
}
// Use the same clean rules as Soundex
str = SoundexUtils.Clean(str);
if (str.Length == 0)
{
return str;
}
// Translate first characters of name:
// MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
str = PAT_MAC.Replace(str, "MCC", 1);
str = PAT_KN.Replace(str, "NN", 1);
str = PAT_K.Replace(str, "C", 1);
str = PAT_PH_PF.Replace(str, "FF", 1);
str = PAT_SCH.Replace(str, "SSS", 1);
// Translate last characters of name:
// EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
str = PAT_EE_IE.Replace(str, "Y", 1);
str = PAT_DT_ETC.Replace(str, "D", 1);
// First character of key = first character of name.
StringBuilder key = new StringBuilder(str.Length);
key.Append(str[0]);
// Transcode remaining characters, incrementing by one character each time
char[] chars = str.ToCharArray();
int len = chars.Length;
for (int i = 1; i < len; i++)
{
char next = i < len - 1 ? chars[i + 1] : SPACE;
char aNext = i < len - 2 ? chars[i + 2] : SPACE;
char[] transcoded = TranscodeRemaining(chars[i - 1], chars[i], next, aNext);
System.Array.Copy(transcoded, 0, chars, i, transcoded.Length);
// only append the current char to the key if it is different from the last one
if (chars[i] != chars[i - 1])
{
key.Append(chars[i]);
}
}
if (key.Length > 1)
{
char lastChar = key[key.Length - 1];
// If last character is S, remove it.
if (lastChar == 'S')
{
//key.deleteCharAt(key.length() - 1);
key.Remove(key.Length - 1, 1);
lastChar = key[key.Length - 1];
}
if (key.Length > 2)
{
char last2Char = key[key.Length - 2];
// If last characters are AY, replace with Y.
if (last2Char == 'A' && lastChar == 'Y')
{
//.key.deleteCharAt(key.length() - 2);
key.Remove(key.Length - 2, 1);
}
}
// If last character is A, remove it.
if (lastChar == 'A')
{
//key.deleteCharAt(key.length() - 1);
key.Remove(key.Length - 1, 1);
}
}
string result = key.ToString();
return this.IsStrict ? result.Substring(0, Math.Min(TRUE_LENGTH, result.Length) - 0) : result;
}
}
}