// blob: c4a1d8768eae95682bf7d8a4242d8c97962d2d8e [file] [log] [blame]
// commons-codec version compatibility level: 1.10
using NUnit.Framework;
using System;
using Assert = Lucene.Net.TestFramework.Assert;
namespace Lucene.Net.Analysis.Phonetic.Language
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Tests <see cref="DaitchMokotoffSoundex"/> through the two entry points used
/// in this fixture: <c>GetSoundex</c> and <c>Encode</c>.
/// <para/>
/// since 1.10
/// </summary>
public class DaitchMokotoffSoundexTest : StringEncoderAbstractTest<DaitchMokotoffSoundex>
{
    /// <summary>
    /// Creates the encoder under test using its parameterless constructor.
    /// </summary>
    /// <returns>A new <see cref="DaitchMokotoffSoundex"/> instance.</returns>
    protected override DaitchMokotoffSoundex CreateStringEncoder() => new DaitchMokotoffSoundex();

    /// <summary>
    /// Forwards to <c>GetSoundex</c> on the inherited <c>StringEncoder</c> member.
    /// In the expectations of this fixture, alternative codes appear separated by <c>|</c>.
    /// </summary>
    /// <param name="source">The name to encode.</param>
    /// <returns>Whatever the encoder's <c>GetSoundex</c> returns for <paramref name="source"/>.</returns>
    private string GetSoundex(string source) => StringEncoder.GetSoundex(source);

    /// <summary>
    /// Forwards to <c>Encode</c> on the inherited <c>StringEncoder</c> member.
    /// </summary>
    /// <param name="source">The name to encode.</param>
    /// <returns>Whatever the encoder's <c>Encode</c> returns for <paramref name="source"/>.</returns>
    private string Encode(string source) => StringEncoder.Encode(source);

    /// <summary>
    /// Asserts, in the order given, that <see cref="GetSoundex(string)"/> yields
    /// <paramref name="expected"/> for every entry of <paramref name="names"/>.
    /// </summary>
    private void AssertSoundexOfEach(string expected, params string[] names)
    {
        foreach (string name in names)
        {
            Assert.AreEqual(expected, GetSoundex(name));
        }
    }

    /// <summary>
    /// Asserts, in the order given, that <see cref="Encode(string)"/> yields
    /// <paramref name="expected"/> for every entry of <paramref name="names"/>.
    /// </summary>
    private void AssertEncodingOfEach(string expected, params string[] names)
    {
        foreach (string name in names)
        {
            Assert.AreEqual(expected, Encode(name));
        }
    }

    /// <summary>
    /// An accented spelling and its plain counterpart must share one code.
    /// </summary>
    [Test]
    public void TestAccentedCharacterFolding()
    {
        AssertSoundexOfEach("294795", "Straßburg", "Strasburg");
        AssertSoundexOfEach("095600", "Éregon", "Eregon");
    }

    /// <summary>
    /// Checks the codes expected when neighbouring letter groups map to the same digit.
    /// </summary>
    [Test]
    public void TestAdjacentCodes()
    {
        // AKSSOL
        // A-KS-S-O-L
        // 0-54-4---8 -> wrong
        // 0-54-----8 -> correct
        string akssol = GetSoundex("AKSSOL");
        Assert.AreEqual("054800", akssol);

        // GERSCHFELD
        // G-E-RS-CH-F-E-L-D
        // 5--4/94-5/4-7-8-3 -> wrong
        // 5--4/94-5/--7-8-3 -> correct
        string gerschfeld = GetSoundex("GERSCHFELD");
        Assert.AreEqual("547830|545783|594783|594578", gerschfeld);
    }

    /// <summary>
    /// Encodes names that also appear in <see cref="TestSoundexBasic"/>, but via
    /// <c>Encode</c>, i.e. without branching.
    /// </summary>
    [Test]
    public void TestEncodeBasic()
    {
        AssertEncodingOfEach("097400", "AUERBACH", "OHRBACH");
        AssertEncodingOfEach("874400", "LIPSHITZ", "LIPPSZYC");
        AssertEncodingOfEach("876450", "LEWINSKY", "LEVINSKI");
        AssertEncodingOfEach("486740", "SZLAMAWICZ", "SHLAMOVITZ");
    }

    /// <summary>
    /// Passes "OBrien" with an apostrophe at each position to the inherited
    /// <c>CheckEncodingVariations</c>, together with the single expected code.
    /// </summary>
    [Test]
    public void TestEncodeIgnoreApostrophes()
    {
        string[] spellings =
        {
            "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien",
            "OBri'en", "OBrie'n", "OBrien'"
        };

        CheckEncodingVariations("079600", spellings);
    }

    /// <summary>
    /// Passes "KINGSMITH" with a hyphen at each position to the inherited
    /// <c>CheckEncodingVariations</c>, together with the single expected code.
    /// <para/>
    /// Test data from http://www.myatt.demon.co.uk/sxalg.htm
    /// </summary>
    [Test]
    public void TestEncodeIgnoreHyphens()
    {
        string[] spellings =
        {
            "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH",
            "KIN-GSMITH", "KING-SMITH", "KINGS-MITH", "KINGSM-ITH", "KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-"
        };

        CheckEncodingVariations("565463", spellings);
    }

    /// <summary>
    /// Surrounding blanks, tabs and line breaks must not affect the encoding.
    /// </summary>
    [Test]
    public void TestEncodeIgnoreTrimmable()
    {
        AssertEncodingOfEach("746536", " \t\n\r Washington \t\n\r ", "Washington");
    }

    /// <summary>
    /// Examples from http://www.jewishgen.org/infofiles/soundex.html
    /// </summary>
    [Test]
    public void TestSoundexBasic()
    {
        AssertSoundexOfEach("583600", "GOLDEN");
        AssertSoundexOfEach("087930", "Alpert");
        AssertSoundexOfEach("791900", "Breuer");
        AssertSoundexOfEach("579000", "Haber");
        AssertSoundexOfEach("665600", "Mannheim");
        AssertSoundexOfEach("664000", "Mintz");
        AssertSoundexOfEach("370000", "Topf");
        AssertSoundexOfEach("586660", "Kleinmann");
        AssertSoundexOfEach("769600", "Ben Aron");

        AssertSoundexOfEach("097400|097500", "AUERBACH", "OHRBACH");
        AssertSoundexOfEach("874400", "LIPSHITZ");
        AssertSoundexOfEach("874400|874500", "LIPPSZYC");
        AssertSoundexOfEach("876450", "LEWINSKY", "LEVINSKI");
        AssertSoundexOfEach("486740", "SZLAMAWICZ", "SHLAMOVITZ");
    }

    /// <summary>
    /// Examples from http://www.avotaynu.com/soundex.htm
    /// </summary>
    [Test]
    public void TestSoundexBasic2()
    {
        AssertSoundexOfEach("467000|567000", "Ceniow");
        AssertSoundexOfEach("467000", "Tsenyuv");
        AssertSoundexOfEach("587400|587500", "Holubica");
        AssertSoundexOfEach("587400", "Golubitsa");
        AssertSoundexOfEach("746480|794648", "Przemysl");
        AssertSoundexOfEach("746480", "Pshemeshil");
        AssertSoundexOfEach("944744|944745|944754|944755|945744|945745|945754|945755", "Rosochowaciec");
        AssertSoundexOfEach("945744", "Rosokhovatsets");
    }

    /// <summary>
    /// Examples from http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
    /// </summary>
    [Test]
    public void TestSoundexBasic3()
    {
        AssertSoundexOfEach("734000|739400", "Peters");
        AssertSoundexOfEach("734600|739460", "Peterson");
        AssertSoundexOfEach("645740", "Moskowitz", "Moskovitz");
        AssertSoundexOfEach("154600|145460|454600|445460", "Jackson");
        AssertSoundexOfEach(
            "154654|154645|154644|145465|145464|454654|454645|454644|445465|445464",
            "Jackson-Jackson");
    }

    /// <summary>
    /// Both Romanian forms of the letter (cedilla and comma below) must give the same codes.
    /// </summary>
    [Test]
    public void TestSpecialRomanianCharacters()
    {
        AssertSoundexOfEach(
            "364000|464000",
            "ţamas",  // t-cedilla
            "țamas"); // t-comma
    }
}
}