| // commons-codec version compatibility level: 1.10 |
| using System; |
| |
| namespace Lucene.Net.Analysis.Phonetic.Language |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a |
| /// general purpose scheme to find words with similar phonemes. |
| /// <para/> |
| /// This class is thread-safe. |
| /// Although not strictly immutable, the <see cref="maxLength"/> field is not actually used. |
| /// </summary> |
| public class Soundex : IStringEncoder |
| { |
/// <summary>
/// Mapping code that flags a letter as silent (ignored).
/// <see cref="GetSoundex(string)"/> skips every letter mapped to this code, except
/// that the first letter of the input is always kept.
/// <para/>
/// Note: <see cref="US_ENGLISH_MAPPING_STRING"/> deliberately does not use this marker
/// because changing it might break existing code. A mapping that contains no silent
/// marker is treated as though H and W are silent.
/// <para/>
/// To override this, use the <see cref="Soundex(string, bool)"/> constructor.
/// <para/>
/// since 1.11
/// </summary>
public static readonly char SILENT_MARKER = '-';

/// <summary>
/// Default mapping for the 26 letters of US English, one code per letter from A to Z.
/// A code of <c>0</c> means the letter is not encoded, but it still acts as a separator
/// when it occurs between consonants that share the same code.
/// <para/>
/// (The constant is exposed both as an implementation convenience and so that
/// documentation can pick up its value.)
/// <para/>
/// <b>Letters H and W are treated specially.</b>
/// After the first letter they are ignored and do not act as separators
/// between consonants with the same code.
/// </summary>
/// <seealso cref="US_ENGLISH_MAPPING"/>
//                                                         ABCDEFGHIJKLMNOPQRSTUVWXYZ
public static readonly string US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";

/// <summary>
/// Character-array form of <see cref="US_ENGLISH_MAPPING_STRING"/>; index 0 holds the
/// code for 'A'. A code of <c>0</c> means the letter is not encoded.
/// </summary>
/// <seealso cref="Soundex.Soundex(char[])"/>
private static readonly char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.ToCharArray();
| |
/// <summary>
/// Shared Soundex instance built with the parameterless constructor, i.e. the
/// default US English mapping with H and W treated as silent letters.
/// Apart from when they are the first letter, H and W are ignored and
/// do not act as separators between duplicate codes.
/// </summary>
/// <seealso cref="US_ENGLISH_MAPPING"/>
/// <seealso cref="US_ENGLISH_MAPPING_STRING"/>
public static readonly Soundex US_ENGLISH = new Soundex();

/// <summary>
/// Shared Soundex instance for the Simplified Soundex variant, as described here:
/// http://west-penwith.org.uk/misc/soundex.htm
/// <para/>
/// H and W receive no special handling, so they behave like the vowels (AEIOUY):
/// they are not encoded after the first letter, but they do separate
/// otherwise-adjacent duplicate codes.
/// The mapping itself is the same as for <see cref="US_ENGLISH"/>.
/// <para/>
/// since 1.11
/// </summary>
public static readonly Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, specialCaseHW: false);

/// <summary>
/// Shared Soundex instance using the mapping described on the Genealogy site:
/// http://www.genealogy.com/articles/research/00000060.html
/// <para/>
/// The vowels (AEIOUY) as well as H and W are mapped to <see cref="SILENT_MARKER"/>.
/// After the first letter they are ignored and do not act as separators
/// when duplicate codes are dropped.
/// <para/>
/// The consonant codes are otherwise identical to those of
/// <see cref="US_ENGLISH_MAPPING_STRING"/> and <see cref="US_ENGLISH_SIMPLIFIED"/>.
/// <para/>
/// since 1.11
/// </summary>
public static readonly Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
//                                                                  ABCDEFGHIJKLMNOPQRSTUVWXYZ
| |
/// <summary>
/// Backing field for <see cref="MaxLength"/>. Soundex codes are four characters long
/// by definition; <see cref="GetSoundex(string)"/> does not read this value.
/// </summary>
[Obsolete("This feature is not needed since the encoding size must be constant. Will be removed in 2.0.")]
private int maxLength = 4;

/// <summary>
/// Code table for this instance: the entry at index <c>ch - 'A'</c> is the code
/// that the letter <c>ch</c> is mapped to. The parameterless constructor
/// installs the default US English table.
/// </summary>
private readonly char[] soundexMapping;

/// <summary>
/// Whether H and W are special-cased.
/// <para/>
/// When <c>true</c>, H and W after the first letter are skipped without consulting
/// the mapping (the only behavior available prior to 1.11).
/// When <c>false</c>, H and W are looked up in the mapping like any other letter.
/// </summary>
private readonly bool specialCaseHW;
| |
/// <summary>
/// Creates an instance that uses the default US English mapping
/// (<see cref="US_ENGLISH_MAPPING"/>) and treats H and W as silent letters.
/// </summary>
/// <seealso cref="Soundex.Soundex(char[])"/>
/// <seealso cref="US_ENGLISH_MAPPING"/>
public Soundex()
{
    // The two assignments are independent of each other.
    specialCaseHW = true;
    soundexMapping = US_ENGLISH_MAPPING;
}
| |
/// <summary>
/// Creates a soundex instance using the given mapping. This constructor can be used to provide an
/// internationalized mapping for a non-Western character set.
/// <para/>
/// Every letter of the alphabet is "mapped" to a code: the entry at index <c>ch - 'A'</c> of
/// <paramref name="mapping"/> is the code for the letter <c>ch</c>. The array is copied, so later
/// changes to the caller's array do not affect this instance.
/// <para/>
/// If the mapping contains an instance of <see cref="SILENT_MARKER"/> then H and W are not given
/// special treatment; otherwise they are treated as silent letters.
/// </summary>
/// <param name="mapping">Mapping array to use when finding the corresponding code for a given character.</param>
/// <exception cref="ArgumentNullException"><paramref name="mapping"/> is <c>null</c>.</exception>
public Soundex(char[] mapping)
{
    if (mapping == null)
    {
        throw new ArgumentNullException(nameof(mapping));
    }

    // Defensive copy: the encoder must not observe later mutations of the caller's array.
    this.soundexMapping = (char[])mapping.Clone();

    // H/W special-casing is only enabled for mappings that do not use the silent marker.
    this.specialCaseHW = !HasMarker(this.soundexMapping);
}
| |
/// <summary>
/// Reports whether the given mapping uses <see cref="SILENT_MARKER"/> for any letter.
/// </summary>
/// <param name="mapping">The code table to scan.</param>
/// <returns><c>true</c> if at least one entry equals <see cref="SILENT_MARKER"/>; otherwise <c>false</c>.</returns>
private bool HasMarker(char[] mapping)
{
    for (int position = 0; position < mapping.Length; position++)
    {
        if (mapping[position] == SILENT_MARKER)
        {
            return true;
        }
    }

    return false;
}
| |
/// <summary>
/// Creates a soundex instance using a custom mapping. This constructor can be used to customize the mapping,
/// and/or possibly provide an internationalized mapping for a non-Western character set.
/// <para/>
/// The character at index <c>ch - 'A'</c> of <paramref name="mapping"/> is the code for the letter <c>ch</c>.
/// <para/>
/// If the mapping contains an instance of <see cref="SILENT_MARKER"/> then H and W are not given
/// special treatment; otherwise they are treated as silent letters.
/// <para/>
/// since 1.4
/// </summary>
/// <param name="mapping">Mapping string to use when finding the corresponding code for a given character.</param>
/// <exception cref="ArgumentNullException"><paramref name="mapping"/> is <c>null</c>.</exception>
public Soundex(string mapping)
{
    if (mapping == null)
    {
        throw new ArgumentNullException(nameof(mapping));
    }

    this.soundexMapping = mapping.ToCharArray();

    // H/W special-casing is only enabled for mappings that do not use the silent marker.
    this.specialCaseHW = !HasMarker(this.soundexMapping);
}
| |
/// <summary>
/// Creates a soundex instance using a custom mapping and an explicit choice of H/W handling.
/// This constructor can be used to customize the mapping, and/or possibly provide an
/// internationalized mapping for a non-Western character set.
/// <para/>
/// The character at index <c>ch - 'A'</c> of <paramref name="mapping"/> is the code for the letter <c>ch</c>.
/// <para/>
/// since 1.11
/// </summary>
/// <param name="mapping">Mapping string to use when finding the corresponding code for a given character.</param>
/// <param name="specialCaseHW">
/// If <c>true</c>, H and W after the first letter are skipped without consulting the mapping, so they
/// neither produce a code nor separate equal codes. If <c>false</c>, H and W are looked up in
/// <paramref name="mapping"/> like any other letter.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="mapping"/> is <c>null</c>.</exception>
public Soundex(string mapping, bool specialCaseHW)
{
    if (mapping == null)
    {
        throw new ArgumentNullException(nameof(mapping));
    }

    this.soundexMapping = mapping.ToCharArray();
    this.specialCaseHW = specialCaseHW;
}
| |
/// <summary>
/// Encodes both strings with this encoder and reports how many characters of the two
/// encoded values match. The result ranges from 0 through 4: 0 indicates little or no
/// similarity, and 4 indicates strong similarity or identical values.
/// <para/>
/// The comparison itself is delegated to
/// <see cref="SoundexUtils.Difference(IStringEncoder, string, string)"/>.
/// <para/>
/// See: <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
/// T-SQL DIFFERENCE </a>
/// <para/>
/// since 1.3
/// </summary>
/// <param name="s1">The first string to encode and compare.</param>
/// <param name="s2">The second string to encode and compare.</param>
/// <returns>The number of matching characters in the two encoded strings, from 0 to 4.</returns>
/// <seealso cref="SoundexUtils.Difference(IStringEncoder, string, string)"/>
public virtual int Difference(string s1, string s2) => SoundexUtils.Difference(this, s1, s2);
| |
| // LUCENENET specific - in .NET we don't need an object overload of Encode(), since strings are sealed anyway. |
| |
/// <summary>
/// Encodes a string using the soundex algorithm.
/// This simply forwards to <see cref="GetSoundex(string)"/>.
/// </summary>
/// <param name="str">The string to encode.</param>
/// <returns>The Soundex code corresponding to <paramref name="str"/>.</returns>
/// <exception cref="ArgumentException">If a character is not mapped.</exception>
public virtual string Encode(string str) => GetSoundex(str);
| |
/// <summary>
/// Gets or sets the maximum code length. Standard Soundex codes are four characters
/// long; the value stored here is not consulted by <see cref="GetSoundex(string)"/>.
/// </summary>
[Obsolete("This feature is not needed since the encoding size must be constant. Will be removed in 2.0.")]
public virtual int MaxLength
{
    get { return maxLength; }
    set { maxLength = value; }
}
| |
/// <summary>
/// Looks up the Soundex code for an upper-case character in this instance's mapping.
/// </summary>
/// <param name="ch">An upper-case character; its offset from 'A' is the index into the mapping.</param>
/// <returns>The code stored in the mapping for <paramref name="ch"/>.</returns>
/// <exception cref="ArgumentException">
/// Thrown if <paramref name="ch"/> falls outside the mapping (before 'A' or beyond its last entry).
/// </exception>
private char Map(char ch)
{
    char[] table = this.soundexMapping;
    int offset = ch - 'A';

    if (offset >= 0 && offset < table.Length)
    {
        return table[offset];
    }

    throw new ArgumentException($"The character is not mapped: {ch} (index={offset})");
}
| |
/// <summary>
/// Computes the Soundex code for a string.
/// <para/>
/// The input is first passed through <c>SoundexUtils.Clean</c>. The result consists of the
/// first character of the cleaned input followed by up to three codes, padded with
/// <c>'0'</c> to a total of four characters.
/// </summary>
/// <param name="str">String to encode using the Soundex algorithm.</param>
/// <returns>
/// <c>null</c> if <paramref name="str"/> is <c>null</c>; the cleaned (empty) string if nothing
/// remains after cleaning; otherwise the four-character soundex code.
/// </returns>
/// <exception cref="ArgumentException">If a character is not mapped.</exception>
public virtual string GetSoundex(string str)
{
    if (str == null)
    {
        return null;
    }

    string cleaned = SoundexUtils.Clean(str);
    if (cleaned.Length == 0)
    {
        return cleaned;
    }

    // Pre-filled with '0' so that short inputs come out padded.
    char[] code = { '0', '0', '0', '0' };

    // The first letter is always emitted verbatim; its code only seeds duplicate detection.
    char leading = cleaned[0];
    code[0] = leading;
    int filled = 1;
    char previous = Map(leading);

    // Stop as soon as the code is full, so trailing letters are never looked up.
    for (int pos = 1; pos < cleaned.Length && filled < code.Length; pos++)
    {
        char letter = cleaned[pos];

        // With special-casing on, H and W are skipped without consulting the mapping.
        bool silentHW = this.specialCaseHW && (letter == 'H' || letter == 'W');
        if (!silentHW)
        {
            char mapped = Map(letter);

            // Silent letters neither emit a code nor reset the duplicate tracker.
            if (mapped != SILENT_MARKER)
            {
                // '0' (not encoded) and repeats of the previous code are not stored.
                if (mapped != '0' && mapped != previous)
                {
                    code[filled++] = mapped;
                }

                previous = mapped;
            }
        }
    }

    return new string(code);
}
| } |
| } |