blob: 12844b1ca24e301374191a30e10f8e87482c89ad [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.codec.language;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoderAbstractTest;
import org.junit.Test;
/**
* Tests {@link DoubleMetaphone}.
*
* <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p>
*
* @see "http://www.cuj.com/documents/s=8038/cuj0006philips/"
*/
public class DoubleMetaphoneTest extends StringEncoderAbstractTest<DoubleMetaphone> {
/**
* Test data from http://aspell.net/test/orig/batch0.tab.
*
* "Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org). Verbatim copying
* and distribution of this entire article is permitted in any medium,
* provided this notice is preserved."
*
* <p>The original data was massaged into the two-element arrays below: each
* entry pairs a word as typed (usually a misspelling) with the intended word.</p>
*/
private static final String[][] FIXTURE = { { "Accosinly", "Occasionally" }, {
"Ciculer", "Circler" }, {
"Circue", "Circle" }, {
"Maddness", "Madness" }, {
"Occusionaly", "Occasionally" }, {
"Steffen", "Stephen" }, {
"Thw", "The" }, {
"Unformanlly", "Unfortunately" }, {
"Unfortally", "Unfortunately" }, {
"abilitey", "ability" }, {
"abouy", "about" }, {
"absorbtion", "absorption" }, {
"accidently", "accidentally" }, {
"accomodate", "accommodate" }, {
"acommadate", "accommodate" }, {
"acord", "accord" }, {
"adultry", "adultery" }, {
"aggresive", "aggressive" }, {
"alchohol", "alcohol" }, {
"alchoholic", "alcoholic" }, {
"allieve", "alive" }, {
"alot", "a lot" }, {
"alright", "all right" }, {
"amature", "amateur" }, {
"ambivilant", "ambivalent" }, {
"amification", "amplification" }, {
"amourfous", "amorphous" }, {
"annoint", "anoint" }, {
"annonsment", "announcement" }, {
"annoyting", "anting" }, {
"annuncio", "announce" }, {
"anonomy", "anatomy" }, {
"anotomy", "anatomy" }, {
"antidesestablishmentarianism", "antidisestablishmentarianism" }, {
"antidisestablishmentarism", "antidisestablishmentarianism" }, {
"anynomous", "anonymous" }, {
"appelet", "applet" }, {
"appreceiated", "appreciated" }, {
"appresteate", "appreciate" }, {
"aquantance", "acquaintance" }, {
"aratictature", "architecture" }, {
"archeype", "archetype" }, {
"aricticure", "architecture" }, {
"artic", "arctic" }, {
"asentote", "asymptote" }, {
"ast", "at" }, {
"asterick", "asterisk" }, {
"asymetric", "asymmetric" }, {
"atentively", "attentively" }, {
"autoamlly", "automatically" }, {
"bankrot", "bankrupt" }, {
"basicly", "basically" }, {
"batallion", "battalion" }, {
"bbrose", "browse" }, {
"beauro", "bureau" }, {
"beaurocracy", "bureaucracy" }, {
"beggining", "beginning" }, {
"beging", "beginning" }, {
"behaviour", "behavior" }, {
"beleive", "believe" }, {
"belive", "believe" }, {
"benidifs", "benefits" }, {
"bigginging", "beginning" }, {
"blait", "bleat" }, {
"bouyant", "buoyant" }, {
"boygot", "boycott" }, {
"brocolli", "broccoli" }, {
"buch", "bush" }, {
"buder", "butter" }, {
"budr", "butter" }, {
"budter", "butter" }, {
"buracracy", "bureaucracy" }, {
"burracracy", "bureaucracy" }, {
"buton", "button" }, {
"byby", "by by" }, {
"cauler", "caller" }, {
"ceasar", "caesar" }, {
"cemetary", "cemetery" }, {
"changeing", "changing" }, {
"cheet", "cheat" }, {
"cicle", "circle" }, {
"cimplicity", "simplicity" }, {
"circumstaces", "circumstances" }, {
"clob", "club" }, {
"coaln", "colon" }, {
"cocamena", "cockamamie" }, {
"colleaque", "colleague" }, {
"colloquilism", "colloquialism" }, {
"columne", "column" }, {
"comiler", "compiler" }, {
"comitmment", "commitment" }, {
"comitte", "committee" }, {
"comittmen", "commitment" }, {
"comittmend", "commitment" }, {
"commerciasl", "commercials" }, {
"commited", "committed" }, {
"commitee", "committee" }, {
"companys", "companies" }, {
"compicated", "complicated" }, {
"comupter", "computer" }, {
"concensus", "consensus" }, {
"confusionism", "confucianism" }, {
"congradulations", "congratulations" }, {
"conibation", "contribution" }, {
"consident", "consistent" }, {
"consident", "consonant" }, {
"contast", "constant" }, {
"contastant", "constant" }, {
"contunie", "continue" }, {
"cooly", "coolly" }, {
"copping", "coping" }, {
"cosmoplyton", "cosmopolitan" }, {
"courst", "court" }, {
"crasy", "crazy" }, {
"cravets", "caveats" }, {
"credetability", "credibility" }, {
"criqitue", "critique" }, {
"croke", "croak" }, {
"crucifiction", "crucifixion" }, {
"crusifed", "crucified" }, {
"ctitique", "critique" }, {
"cumba", "combo" }, {
"custamisation", "customization" }, {
"dag", "dog" }, {
"daly", "daily" }, {
"danguages", "dangerous" }, {
"deaft", "draft" }, {
"defence", "defense" }, {
"defenly", "defiantly" }, {
"definate", "definite" }, {
"definately", "definitely" }, {
"dependeble", "dependable" }, {
"descrption", "description" }, {
"descrptn", "description" }, {
"desparate", "desperate" }, {
"dessicate", "desiccate" }, {
"destint", "distant" }, {
"develepment", "developments" }, {
"developement", "development" }, {
"develpond", "development" }, {
"devulge", "divulge" }, {
"diagree", "disagree" }, {
"dieties", "deities" }, {
"dinasaur", "dinosaur" }, {
"dinasour", "dinosaur" }, {
"direcyly", "directly" }, {
"discuess", "discuss" }, {
"disect", "dissect" }, {
"disippate", "dissipate" }, {
"disition", "decision" }, {
"dispair", "despair" }, {
"disssicion", "discussion" }, {
"distarct", "distract" }, {
"distart", "distort" }, {
"distroy", "destroy" }, {
"documtations", "documentation" }, {
"doenload", "download" }, {
"dongle", "dangle" }, {
"doog", "dog" }, {
"dramaticly", "dramatically" }, {
"drunkeness", "drunkenness" }, {
"ductioneery", "dictionary" }, {
"dur", "due" }, {
"duren", "during" }, {
"dymatic", "dynamic" }, {
"dynaic", "dynamic" }, {
"ecstacy", "ecstasy" }, {
"efficat", "efficient" }, {
"efficity", "efficacy" }, {
"effots", "efforts" }, {
"egsistence", "existence" }, {
"eitiology", "etiology" }, {
"elagent", "elegant" }, {
"elligit", "elegant" }, {
"embarass", "embarrass" }, {
"embarassment", "embarrassment" }, {
"embaress", "embarrass" }, {
"encapsualtion", "encapsulation" }, {
"encyclapidia", "encyclopedia" }, {
"encyclopia", "encyclopedia" }, {
"engins", "engine" }, {
"enhence", "enhance" }, {
"enligtment", "Enlightenment" }, {
"ennuui", "ennui" }, {
"enought", "enough" }, {
"enventions", "inventions" }, {
"envireminakl", "environmental" }, {
"enviroment", "environment" }, {
"epitomy", "epitome" }, {
"equire", "acquire" }, {
"errara", "error" }, {
"erro", "error" }, {
"evaualtion", "evaluation" }, {
"evething", "everything" }, {
"evtually", "eventually" }, {
"excede", "exceed" }, {
"excercise", "exercise" }, {
"excpt", "except" }, {
"excution", "execution" }, {
"exhileration", "exhilaration" }, {
"existance", "existence" }, {
"expleyly", "explicitly" }, {
"explity", "explicitly" }, {
"expresso", "espresso" }, {
"exspidient", "expedient" }, {
"extions", "extensions" }, {
"factontion", "factorization" }, {
"failer", "failure" }, {
"famdasy", "fantasy" }, {
"faver", "favor" }, {
"faxe", "fax" }, {
"febuary", "february" }, {
"firey", "fiery" }, {
"fistival", "festival" }, {
"flatterring", "flattering" }, {
"fluk", "flux" }, {
"flukse", "flux" }, {
"fone", "phone" }, {
"forsee", "foresee" }, {
"frustartaion", "frustrating" }, {
"fuction", "function" }, {
"funetik", "phonetic" }, {
"futs", "guts" }, {
"gamne", "came" }, {
"gaurd", "guard" }, {
"generly", "generally" }, {
"ghandi", "gandhi" }, {
"goberment", "government" }, {
"gobernement", "government" }, {
"gobernment", "government" }, {
"gotton", "gotten" }, {
"gracefull", "graceful" }, {
"gradualy", "gradually" }, {
"grammer", "grammar" }, {
"hallo", "hello" }, {
"hapily", "happily" }, {
"harrass", "harass" }, {
"havne", "have" }, {
"heellp", "help" }, {
"heighth", "height" }, {
"hellp", "help" }, {
"helo", "hello" }, {
"herlo", "hello" }, {
"hifin", "hyphen" }, {
"hifine", "hyphen" }, {
"higer", "higher" }, {
"hiphine", "hyphen" }, {
"hippie", "hippy" }, {
"hippopotamous", "hippopotamus" }, {
"hlp", "help" }, {
"hourse", "horse" }, {
"houssing", "housing" }, {
"howaver", "however" }, {
"howver", "however" }, {
"humaniti", "humanity" }, {
"hyfin", "hyphen" }, {
"hypotathes", "hypothesis" }, {
"hypotathese", "hypothesis" }, {
"hystrical", "hysterical" }, {
"ident", "indent" }, {
"illegitament", "illegitimate" }, {
"imbed", "embed" }, {
"imediaetly", "immediately" }, {
"imfamy", "infamy" }, {
"immenant", "immanent" }, {
"implemtes", "implements" }, {
"inadvertant", "inadvertent" }, {
"incase", "in case" }, {
"incedious", "insidious" }, {
"incompleet", "incomplete" }, {
"incomplot", "incomplete" }, {
"inconvenant", "inconvenient" }, {
"inconvience", "inconvenience" }, {
"independant", "independent" }, {
"independenent", "independent" }, {
"indepnends", "independent" }, {
"indepth", "in depth" }, {
"indispensible", "indispensable" }, {
"inefficite", "inefficient" }, {
"inerface", "interface" }, {
"infact", "in fact" }, {
"influencial", "influential" }, {
"inital", "initial" }, {
"initinized", "initialized" }, {
"initized", "initialized" }, {
"innoculate", "inoculate" }, {
"insistant", "insistent" }, {
"insistenet", "insistent" }, {
"instulation", "installation" }, {
"intealignt", "intelligent" }, {
"intejilent", "intelligent" }, {
"intelegent", "intelligent" }, {
"intelegnent", "intelligent" }, {
"intelejent", "intelligent" }, {
"inteligent", "intelligent" }, {
"intelignt", "intelligent" }, {
"intellagant", "intelligent" }, {
"intellegent", "intelligent" }, {
"intellegint", "intelligent" }, {
"intellgnt", "intelligent" }, {
"intensionality", "intensionally" }, {
"interate", "iterate" }, {
"internation", "international" }, {
"interpretate", "interpret" }, {
"interpretter", "interpreter" }, {
"intertes", "interested" }, {
"intertesd", "interested" }, {
"invermeantial", "environmental" }, {
"irregardless", "regardless" }, {
"irresistable", "irresistible" }, {
"irritible", "irritable" }, {
"islams", "muslims" }, {
"isotrop", "isotope" }, {
"isreal", "israel" }, {
"johhn", "john" }, {
"judgement", "judgment" }, {
"kippur", "kipper" }, {
"knawing", "knowing" }, {
"latext", "latest" }, {
"leasve", "leave" }, {
"lesure", "leisure" }, {
"liasion", "lesion" }, {
"liason", "liaison" }, {
"libary", "library" }, {
"likly", "likely" }, {
"lilometer", "kilometer" }, {
"liquify", "liquefy" }, {
"lloyer", "layer" }, {
"lossing", "losing" }, {
"luser", "laser" }, {
"maintanence", "maintenance" }, {
"majaerly", "majority" }, {
"majoraly", "majority" }, {
"maks", "masks" }, {
"mandelbrot", "Mandelbrot" }, {
"mant", "want" }, {
"marshall", "marshal" }, {
"maxium", "maximum" }, {
"meory", "memory" }, {
"metter", "better" }, {
"mic", "mike" }, {
"midia", "media" }, {
"millenium", "millennium" }, {
"miniscule", "minuscule" }, {
"minkay", "monkey" }, {
"minum", "minimum" }, {
"mischievious", "mischievous" }, {
"misilous", "miscellaneous" }, {
"momento", "memento" }, {
"monkay", "monkey" }, {
"mosaik", "mosaic" }, {
"mostlikely", "most likely" }, {
"mousr", "mouser" }, {
"mroe", "more" }, {
"neccessary", "necessary" }, {
"necesary", "necessary" }, {
"necesser", "necessary" }, {
"neice", "niece" }, {
"neighbour", "neighbor" }, {
"nemonic", "pneumonic" }, {
"nevade", "Nevada" }, {
"nickleodeon", "nickelodeon" }, {
"nieve", "naive" }, {
"noone", "no one" }, {
"noticably", "noticeably" }, {
"notin", "not in" }, {
"nozled", "nuzzled" }, {
"objectsion", "objects" }, {
"obsfuscate", "obfuscate" }, {
"ocassion", "occasion" }, {
"occuppied", "occupied" }, {
"occurence", "occurrence" }, {
"octagenarian", "octogenarian" }, {
"olf", "old" }, {
"opposim", "opossum" }, {
"organise", "organize" }, {
"organiz", "organize" }, {
"orientate", "orient" }, {
"oscilascope", "oscilloscope" }, {
"oving", "moving" }, {
"paramers", "parameters" }, {
"parametic", "parameter" }, {
"paranets", "parameters" }, {
"partrucal", "particular" }, {
"pataphysical", "metaphysical" }, {
"patten", "pattern" }, {
"permissable", "permissible" }, {
"permition", "permission" }, {
"permmasivie", "permissive" }, {
"perogative", "prerogative" }, {
"persue", "pursue" }, {
"phantasia", "fantasia" }, {
"phenominal", "phenomenal" }, {
"picaresque", "picturesque" }, {
"playwrite", "playwright" }, {
"poeses", "poesies" }, {
"polation", "politician" }, {
"poligamy", "polygamy" }, {
"politict", "politic" }, {
"pollice", "police" }, {
"polypropalene", "polypropylene" }, {
"pompom", "pompon" }, {
"possable", "possible" }, {
"practicle", "practical" }, {
"pragmaticism", "pragmatism" }, {
"preceeding", "preceding" }, {
"precion", "precision" }, {
"precios", "precision" }, {
"preemptory", "peremptory" }, {
"prefices", "prefixes" }, {
"prefixt", "prefixed" }, {
"presbyterian", "Presbyterian" }, {
"presue", "pursue" }, {
"presued", "pursued" }, {
"privielage", "privilege" }, {
"priviledge", "privilege" }, {
"proceedures", "procedures" }, {
"pronensiation", "pronunciation" }, {
"pronisation", "pronunciation" }, {
"pronounciation", "pronunciation" }, {
"properally", "properly" }, {
"proplematic", "problematic" }, {
"protray", "portray" }, {
"pscolgst", "psychologist" }, {
"psicolagest", "psychologist" }, {
"psycolagest", "psychologist" }, {
"quoz", "quiz" }, {
"radious", "radius" }, {
"ramplily", "rampantly" }, {
"reccomend", "recommend" }, {
"reccona", "raccoon" }, {
"recieve", "receive" }, {
"reconise", "recognize" }, {
"rectangeles", "rectangle" }, {
"redign", "redesign" }, {
"reoccurring", "recurring" }, {
"repitition", "repetition" }, {
"replasments", "replacement" }, {
"reposable", "responsible" }, {
"reseblence", "resemblance" }, {
"respct", "respect" }, {
"respecally", "respectfully" }, {
"roon", "room" }, {
"rought", "roughly" }, {
"rsx", "RSX" }, {
"rudemtry", "rudimentary" }, {
"runnung", "running" }, {
"sacreligious", "sacrilegious" }, {
"saftly", "safely" }, {
"salut", "salute" }, {
"satifly", "satisfy" }, {
"scrabdle", "scrabble" }, {
"searcheable", "searchable" }, {
"secion", "section" }, {
"seferal", "several" }, {
"segements", "segments" }, {
"sence", "sense" }, {
"seperate", "separate" }, {
"sherbert", "sherbet" }, {
"sicolagest", "psychologist" }, {
"sieze", "seize" }, {
"simpfilty", "simplicity" }, {
"simplye", "simply" }, {
"singal", "signal" }, {
"sitte", "site" }, {
"situration", "situation" }, {
"slyph", "sylph" }, {
"smil", "smile" }, {
"snuck", "sneaked" }, {
"sometmes", "sometimes" }, {
"soonec", "sonic" }, {
"specificialy", "specifically" }, {
"spel", "spell" }, {
"spoak", "spoke" }, {
"sponsered", "sponsored" }, {
"stering", "steering" }, {
"straightjacket", "straitjacket" }, {
"stumach", "stomach" }, {
"stutent", "student" }, {
"styleguide", "style guide" }, {
"subisitions", "substitutions" }, {
"subjecribed", "subscribed" }, {
"subpena", "subpoena" }, {
"substations", "substitutions" }, {
"suger", "sugar" }, {
"supercede", "supersede" }, {
"superfulous", "superfluous" }, {
"susan", "Susan" }, {
"swimwear", "swim wear" }, {
"syncorization", "synchronization" }, {
"taff", "tough" }, {
"taht", "that" }, {
"tattos", "tattoos" }, {
"techniquely", "technically" }, {
"teh", "the" }, {
"tem", "team" }, {
"teo", "two" }, {
"teridical", "theoretical" }, {
"tesst", "test" }, {
"tets", "tests" }, {
"thanot", "than or" }, {
"theirselves", "themselves" }, {
"theridically", "theoretical" }, {
"thredically", "theoretically" }, {
"thruout", "throughout" }, {
"ths", "this" }, {
"titalate", "titillate" }, {
"tobagan", "tobaggon" }, {
"tommorrow", "tomorrow" }, {
"tomorow", "tomorrow" }, {
"tradegy", "tragedy" }, {
"trubbel", "trouble" }, {
"ttest", "test" }, {
"tunnellike", "tunnel like" }, {
"tured", "turned" }, {
"tyrrany", "tyranny" }, {
"unatourral", "unnatural" }, {
"unaturral", "unnatural" }, {
"unconisitional", "unconstitutional" }, {
"unconscience", "unconscious" }, {
"underladder", "under ladder" }, {
"unentelegible", "unintelligible" }, {
"unfortunently", "unfortunately" }, {
"unnaturral", "unnatural" }, {
"upcast", "up cast" }, {
"upmost", "utmost" }, {
"uranisium", "uranium" }, {
"verison", "version" }, {
"vinagarette", "vinaigrette" }, {
"volumptuous", "voluptuous" }, {
"volunteerism", "voluntarism" }, {
"volye", "volley" }, {
"wadting", "wasting" }, {
"waite", "wait" }, {
"wan't", "won't" }, {
"warloord", "warlord" }, {
"whaaat", "what" }, {
"whard", "ward" }, {
"whimp", "wimp" }, {
"wicken", "weaken" }, {
"wierd", "weird" }, {
"wrank", "rank" }, {
"writeen", "righten" }, {
"writting", "writing" }, {
"wundeews", "windows" }, {
"yeild", "yield" }, {
"youe", "your" }
};
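/**
* A subset of {@link #FIXTURE} generated by this test. The entries kept here
* are presumably the pairs whose Double Metaphone encodings match (TODO:
* confirm against the tests that consume this array).
*/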
private static final String[][] MATCHES = { { "Accosinly", "Occasionally" }, {
"Maddness", "Madness" }, {
"Occusionaly", "Occasionally" }, {
"Steffen", "Stephen" }, {
"Thw", "The" }, {
"Unformanlly", "Unfortunately" }, {
"Unfortally", "Unfortunately" }, {
"abilitey", "ability" }, {
"absorbtion", "absorption" }, {
"accidently", "accidentally" }, {
"accomodate", "accommodate" }, {
"acommadate", "accommodate" }, {
"acord", "accord" }, {
"adultry", "adultery" }, {
"aggresive", "aggressive" }, {
"alchohol", "alcohol" }, {
"alchoholic", "alcoholic" }, {
"allieve", "alive" }, {
"alot", "a lot" }, {
"alright", "all right" }, {
"amature", "amateur" }, {
"ambivilant", "ambivalent" }, {
"amourfous", "amorphous" }, {
"annoint", "anoint" }, {
"annonsment", "announcement" }, {
"annoyting", "anting" }, {
"annuncio", "announce" }, {
"anotomy", "anatomy" }, {
"antidesestablishmentarianism", "antidisestablishmentarianism" }, {
"antidisestablishmentarism", "antidisestablishmentarianism" }, {
"anynomous", "anonymous" }, {
"appelet", "applet" }, {
"appreceiated", "appreciated" }, {
"appresteate", "appreciate" }, {
"aquantance", "acquaintance" }, {
"aricticure", "architecture" }, {
"asterick", "asterisk" }, {
"asymetric", "asymmetric" }, {
"atentively", "attentively" }, {
"bankrot", "bankrupt" }, {
"basicly", "basically" }, {
"batallion", "battalion" }, {
"bbrose", "browse" }, {
"beauro", "bureau" }, {
"beaurocracy", "bureaucracy" }, {
"beggining", "beginning" }, {
"behaviour", "behavior" }, {
"beleive", "believe" }, {
"belive", "believe" }, {
"blait", "bleat" }, {
"bouyant", "buoyant" }, {
"boygot", "boycott" }, {
"brocolli", "broccoli" }, {
"buder", "butter" }, {
"budr", "butter" }, {
"budter", "butter" }, {
"buracracy", "bureaucracy" }, {
"burracracy", "bureaucracy" }, {
"buton", "button" }, {
"byby", "by by" }, {
"cauler", "caller" }, {
"ceasar", "caesar" }, {
"cemetary", "cemetery" }, {
"changeing", "changing" }, {
"cheet", "cheat" }, {
"cimplicity", "simplicity" }, {
"circumstaces", "circumstances" }, {
"clob", "club" }, {
"coaln", "colon" }, {
"colleaque", "colleague" }, {
"colloquilism", "colloquialism" }, {
"columne", "column" }, {
"comitmment", "commitment" }, {
"comitte", "committee" }, {
"comittmen", "commitment" }, {
"comittmend", "commitment" }, {
"commerciasl", "commercials" }, {
"commited", "committed" }, {
"commitee", "committee" }, {
"companys", "companies" }, {
"comupter", "computer" }, {
"concensus", "consensus" }, {
"confusionism", "confucianism" }, {
"congradulations", "congratulations" }, {
"contunie", "continue" }, {
"cooly", "coolly" }, {
"copping", "coping" }, {
"cosmoplyton", "cosmopolitan" }, {
"crasy", "crazy" }, {
"croke", "croak" }, {
"crucifiction", "crucifixion" }, {
"crusifed", "crucified" }, {
"cumba", "combo" }, {
"custamisation", "customization" }, {
"dag", "dog" }, {
"daly", "daily" }, {
"defence", "defense" }, {
"definate", "definite" }, {
"definately", "definitely" }, {
"dependeble", "dependable" }, {
"descrption", "description" }, {
"descrptn", "description" }, {
"desparate", "desperate" }, {
"dessicate", "desiccate" }, {
"destint", "distant" }, {
"develepment", "developments" }, {
"developement", "development" }, {
"develpond", "development" }, {
"devulge", "divulge" }, {
"dieties", "deities" }, {
"dinasaur", "dinosaur" }, {
"dinasour", "dinosaur" }, {
"discuess", "discuss" }, {
"disect", "dissect" }, {
"disippate", "dissipate" }, {
"disition", "decision" }, {
"dispair", "despair" }, {
"distarct", "distract" }, {
"distart", "distort" }, {
"distroy", "destroy" }, {
"doenload", "download" }, {
"dongle", "dangle" }, {
"doog", "dog" }, {
"dramaticly", "dramatically" }, {
"drunkeness", "drunkenness" }, {
"ductioneery", "dictionary" }, {
"ecstacy", "ecstasy" }, {
"egsistence", "existence" }, {
"eitiology", "etiology" }, {
"elagent", "elegant" }, {
"embarass", "embarrass" }, {
"embarassment", "embarrassment" }, {
"embaress", "embarrass" }, {
"encapsualtion", "encapsulation" }, {
"encyclapidia", "encyclopedia" }, {
"encyclopia", "encyclopedia" }, {
"engins", "engine" }, {
"enhence", "enhance" }, {
"ennuui", "ennui" }, {
"enventions", "inventions" }, {
"envireminakl", "environmental" }, {
"enviroment", "environment" }, {
"epitomy", "epitome" }, {
"equire", "acquire" }, {
"errara", "error" }, {
"evaualtion", "evaluation" }, {
"excede", "exceed" }, {
"excercise", "exercise" }, {
"excpt", "except" }, {
"exhileration", "exhilaration" }, {
"existance", "existence" }, {
"expleyly", "explicitly" }, {
"explity", "explicitly" }, {
"failer", "failure" }, {
"faver", "favor" }, {
"faxe", "fax" }, {
"firey", "fiery" }, {
"fistival", "festival" }, {
"flatterring", "flattering" }, {
"flukse", "flux" }, {
"fone", "phone" }, {
"forsee", "foresee" }, {
"frustartaion", "frustrating" }, {
"funetik", "phonetic" }, {
"gaurd", "guard" }, {
"generly", "generally" }, {
"ghandi", "gandhi" }, {
"gotton", "gotten" }, {
"gracefull", "graceful" }, {
"gradualy", "gradually" }, {
"grammer", "grammar" }, {
"hallo", "hello" }, {
"hapily", "happily" }, {
"harrass", "harass" }, {
"heellp", "help" }, {
"heighth", "height" }, {
"hellp", "help" }, {
"helo", "hello" }, {
"hifin", "hyphen" }, {
"hifine", "hyphen" }, {
"hiphine", "hyphen" }, {
"hippie", "hippy" }, {
"hippopotamous", "hippopotamus" }, {
"hourse", "horse" }, {
"houssing", "housing" }, {
"howaver", "however" }, {
"howver", "however" }, {
"humaniti", "humanity" }, {
"hyfin", "hyphen" }, {
"hystrical", "hysterical" }, {
"illegitament", "illegitimate" }, {
"imbed", "embed" }, {
"imediaetly", "immediately" }, {
"immenant", "immanent" }, {
"implemtes", "implements" }, {
"inadvertant", "inadvertent" }, {
"incase", "in case" }, {
"incedious", "insidious" }, {
"incompleet", "incomplete" }, {
"incomplot", "incomplete" }, {
"inconvenant", "inconvenient" }, {
"inconvience", "inconvenience" }, {
"independant", "independent" }, {
"independenent", "independent" }, {
"indepnends", "independent" }, {
"indepth", "in depth" }, {
"indispensible", "indispensable" }, {
"inefficite", "inefficient" }, {
"infact", "in fact" }, {
"influencial", "influential" }, {
"innoculate", "inoculate" }, {
"insistant", "insistent" }, {
"insistenet", "insistent" }, {
"instulation", "installation" }, {
"intealignt", "intelligent" }, {
"intelegent", "intelligent" }, {
"intelegnent", "intelligent" }, {
"intelejent", "intelligent" }, {
"inteligent", "intelligent" }, {
"intelignt", "intelligent" }, {
"intellagant", "intelligent" }, {
"intellegent", "intelligent" }, {
"intellegint", "intelligent" }, {
"intellgnt", "intelligent" }, {
"intensionality", "intensionally" }, {
"internation", "international" }, {
"interpretate", "interpret" }, {
"interpretter", "interpreter" }, {
"intertes", "interested" }, {
"intertesd", "interested" }, {
"invermeantial", "environmental" }, {
"irresistable", "irresistible" }, {
"irritible", "irritable" }, {
"isreal", "israel" }, {
"johhn", "john" }, {
"kippur", "kipper" }, {
"knawing", "knowing" }, {
"lesure", "leisure" }, {
"liasion", "lesion" }, {
"liason", "liaison" }, {
"likly", "likely" }, {
"liquify", "liquefy" }, {
"lloyer", "layer" }, {
"lossing", "losing" }, {
"luser", "laser" }, {
"maintanence", "maintenance" }, {
"mandelbrot", "Mandelbrot" }, {
"marshall", "marshal" }, {
"maxium", "maximum" }, {
"mic", "mike" }, {
"midia", "media" }, {
"millenium", "millennium" }, {
"miniscule", "minuscule" }, {
"minkay", "monkey" }, {
"mischievious", "mischievous" }, {
"momento", "memento" }, {
"monkay", "monkey" }, {
"mosaik", "mosaic" }, {
"mostlikely", "most likely" }, {
"mousr", "mouser" }, {
"mroe", "more" }, {
"necesary", "necessary" }, {
"necesser", "necessary" }, {
"neice", "niece" }, {
"neighbour", "neighbor" }, {
"nemonic", "pneumonic" }, {
"nevade", "Nevada" }, {
"nickleodeon", "nickelodeon" }, {
"nieve", "naive" }, {
"noone", "no one" }, {
"notin", "not in" }, {
"nozled", "nuzzled" }, {
"objectsion", "objects" }, {
"ocassion", "occasion" }, {
"occuppied", "occupied" }, {
"occurence", "occurrence" }, {
"octagenarian", "octogenarian" }, {
"opposim", "opossum" }, {
"organise", "organize" }, {
"organiz", "organize" }, {
"orientate", "orient" }, {
"oscilascope", "oscilloscope" }, {
"parametic", "parameter" }, {
"permissable", "permissible" }, {
"permmasivie", "permissive" }, {
"persue", "pursue" }, {
"phantasia", "fantasia" }, {
"phenominal", "phenomenal" }, {
"playwrite", "playwright" }, {
"poeses", "poesies" }, {
"poligamy", "polygamy" }, {
"politict", "politic" }, {
"pollice", "police" }, {
"polypropalene", "polypropylene" }, {
"possable", "possible" }, {
"practicle", "practical" }, {
"pragmaticism", "pragmatism" }, {
"preceeding", "preceding" }, {
"precios", "precision" }, {
"preemptory", "peremptory" }, {
"prefixt", "prefixed" }, {
"presbyterian", "Presbyterian" }, {
"presue", "pursue" }, {
"presued", "pursued" }, {
"privielage", "privilege" }, {
"priviledge", "privilege" }, {
"proceedures", "procedures" }, {
"pronensiation", "pronunciation" }, {
"pronounciation", "pronunciation" }, {
"properally", "properly" }, {
"proplematic", "problematic" }, {
"protray", "portray" }, {
"pscolgst", "psychologist" }, {
"psicolagest", "psychologist" }, {
"psycolagest", "psychologist" }, {
"quoz", "quiz" }, {
"radious", "radius" }, {
"reccomend", "recommend" }, {
"reccona", "raccoon" }, {
"recieve", "receive" }, {
"reconise", "recognize" }, {
"rectangeles", "rectangle" }, {
"reoccurring", "recurring" }, {
"repitition", "repetition" }, {
"replasments", "replacement" }, {
"respct", "respect" }, {
"respecally", "respectfully" }, {
"rsx", "RSX" }, {
"runnung", "running" }, {
"sacreligious", "sacrilegious" }, {
"salut", "salute" }, {
"searcheable", "searchable" }, {
"seferal", "several" }, {
"segements", "segments" }, {
"sence", "sense" }, {
"seperate", "separate" }, {
"sicolagest", "psychologist" }, {
"sieze", "seize" }, {
"simplye", "simply" }, {
"sitte", "site" }, {
"slyph", "sylph" }, {
"smil", "smile" }, {
"sometmes", "sometimes" }, {
"soonec", "sonic" }, {
"specificialy", "specifically" }, {
"spel", "spell" }, {
"spoak", "spoke" }, {
"sponsered", "sponsored" }, {
"stering", "steering" }, {
"straightjacket", "straitjacket" }, {
"stumach", "stomach" }, {
"stutent", "student" }, {
"styleguide", "style guide" }, {
"subpena", "subpoena" }, {
"substations", "substitutions" }, {
"supercede", "supersede" }, {
"superfulous", "superfluous" }, {
"susan", "Susan" }, {
"swimwear", "swim wear" }, {
"syncorization", "synchronization" }, {
"taff", "tough" }, {
"taht", "that" }, {
"tattos", "tattoos" }, {
"techniquely", "technically" }, {
"teh", "the" }, {
"tem", "team" }, {
"teo", "two" }, {
"teridical", "theoretical" }, {
"tesst", "test" }, {
"theridically", "theoretical" }, {
"thredically", "theoretically" }, {
"thruout", "throughout" }, {
"ths", "this" }, {
"titalate", "titillate" }, {
"tobagan", "tobaggon" }, {
"tommorrow", "tomorrow" }, {
"tomorow", "tomorrow" }, {
"trubbel", "trouble" }, {
"ttest", "test" }, {
"tyrrany", "tyranny" }, {
"unatourral", "unnatural" }, {
"unaturral", "unnatural" }, {
"unconisitional", "unconstitutional" }, {
"unconscience", "unconscious" }, {
"underladder", "under ladder" }, {
"unentelegible", "unintelligible" }, {
"unfortunently", "unfortunately" }, {
"unnaturral", "unnatural" }, {
"upcast", "up cast" }, {
"verison", "version" }, {
"vinagarette", "vinaigrette" }, {
"volunteerism", "voluntarism" }, {
"volye", "volley" }, {
"waite", "wait" }, {
"wan't", "won't" }, {
"warloord", "warlord" }, {
"whaaat", "what" }, {
"whard", "ward" }, {
"whimp", "wimp" }, {
"wicken", "weaken" }, {
"wierd", "weird" }, {
"wrank", "rank" }, {
"writeen", "righten" }, {
"writting", "writing" }, {
"wundeews", "windows" }, {
"yeild", "yield" }, };
/**
 * Asserts that every primary-encoding entry point yields {@code expected} for {@code source}.
 *
 * <p>Checked in order: {@code encode(String)}, {@code encode(Object)},
 * {@code doubleMetaphone(String)} and {@code doubleMetaphone(String, false)}.</p>
 *
 * @param expected the expected encoding
 * @param source the text to encode
 */
private void assertDoubleMetaphone(final String expected, final String source) {
    final DoubleMetaphone encoder = this.getStringEncoder();
    assertEquals(expected, encoder.encode(source));
    try {
        assertEquals(expected, encoder.encode((Object) source));
    } catch (final EncoderException e) {
        // encode(Object) declares a checked exception; surface it as a test failure.
        fail("Unexpected exception: " + e);
    }
    assertEquals(expected, encoder.doubleMetaphone(source));
    assertEquals(expected, encoder.doubleMetaphone(source, false));
}
/**
 * Asserts that {@code doubleMetaphone(source, true)}, i.e. the alternate encoding,
 * equals {@code expected}.
 *
 * @param expected the expected alternate encoding
 * @param source the text to encode
 */
public void assertDoubleMetaphoneAlt(final String expected, final String source) {
    final String alternateCode = this.getStringEncoder().doubleMetaphone(source, true);
    assertEquals(expected, alternateCode);
}
/**
 * Supplies the encoder under test, built with the no-argument constructor.
 *
 * @return a new {@link DoubleMetaphone}
 */
@Override
protected DoubleMetaphone createStringEncoder() {
    final DoubleMetaphone encoder = new DoubleMetaphone();
    return encoder;
}
/**
 * Asserts that the two words of every pair are Double Metaphone equal, in both argument orders.
 *
 * <p>The pairs are first handed to {@code validateFixture}. When {@code useAlternate} is
 * {@code false}, the two-argument {@code isDoubleMetaphoneEqual} overload is checked too.</p>
 *
 * @param pairs two-element arrays of words expected to match
 * @param useAlternate passed through to {@code isDoubleMetaphoneEqual(String, String, boolean)}
 */
public void doubleMetaphoneEqualTest(final String[][] pairs, final boolean useAlternate) {
    this.validateFixture(pairs);
    final DoubleMetaphone encoder = this.getStringEncoder();
    for (int i = 0; i < pairs.length; i++) {
        final String left = pairs[i][0];
        final String right = pairs[i][1];
        final String message = "Expected match between " + left + " and " + right + " (use alternate: " + useAlternate + ")";
        assertTrue(message, encoder.isDoubleMetaphoneEqual(left, right, useAlternate));
        assertTrue(message, encoder.isDoubleMetaphoneEqual(right, left, useAlternate));
        if (useAlternate) {
            // The two-argument overload is only exercised for the primary encoding.
            continue;
        }
        assertTrue(message, encoder.isDoubleMetaphoneEqual(left, right));
        assertTrue(message, encoder.isDoubleMetaphoneEqual(right, left));
    }
}
/**
 * Asserts that "Brain" and "Band" are not Double Metaphone equal, in both argument orders.
 *
 * <p>When {@code alternate} is {@code false}, the two-argument
 * {@code isDoubleMetaphoneEqual} overload is checked as well.</p>
 *
 * @param alternate passed through to {@code isDoubleMetaphoneEqual(String, String, boolean)}
 */
public void doubleMetaphoneNotEqualTest(final boolean alternate) {
    final DoubleMetaphone encoder = this.getStringEncoder();
    final String first = "Brain";
    final String second = "Band";
    assertFalse(encoder.isDoubleMetaphoneEqual(first, second, alternate));
    assertFalse(encoder.isDoubleMetaphoneEqual(second, first, alternate));
    if (alternate) {
        return;
    }
    assertFalse(encoder.isDoubleMetaphoneEqual(first, second));
    assertFalse(encoder.isDoubleMetaphoneEqual(second, first));
}
/**
 * Checks that a lone c-cedilla is Double Metaphone equal to "S".
 */
@Test
public void testCCedilla() {
    final String cCedilla = "\u00e7";
    final boolean matchesS = this.getStringEncoder().isDoubleMetaphoneEqual(cCedilla, "S");
    assertTrue(matchesS);
}
/**
 * Regression test (named after CODEC-184) for comparisons involving empty strings.
 *
 * <p>Two empty strings must compare as equal; an empty string and "aa" must not, in
 * either argument order, both with and without the alternate encoding.</p>
 */
@Test
public void testCodec184() {
    final DoubleMetaphone encoder = new DoubleMetaphone();
    assertTrue(encoder.isDoubleMetaphoneEqual("", "", false));
    assertTrue(encoder.isDoubleMetaphoneEqual("", "", true));
    assertFalse(encoder.isDoubleMetaphoneEqual("aa", "", false));
    assertFalse(encoder.isDoubleMetaphoneEqual("aa", "", true));
    assertFalse(encoder.isDoubleMetaphoneEqual("", "aa", false));
    assertFalse(encoder.isDoubleMetaphoneEqual("", "aa", true));
}
/**
 * Checks a table of known primary encodings, then a table of known alternate encodings.
 */
@Test
public void testDoubleMetaphone() {
    // Each row is { expected primary code, input }.
    final String[][] primaryCases = {
        { "TSTN", "testing" },
        { "0", "The" },
        { "KK", "quick" },
        { "PRN", "brown" },
        { "FKS", "fox" },
        { "JMPT", "jumped" },
        { "AFR", "over" },
        { "0", "the" },
        { "LS", "lazy" },
        { "TKS", "dogs" },
        { "MKFR", "MacCafferey" },
        { "STFN", "Stephan" },
        { "KSSK", "Kuczewski" },
        { "MKLL", "McClelland" },
        { "SNHS", "san jose" },
        { "SNFP", "xenophobia" },
    };
    for (final String[] row : primaryCases) {
        assertDoubleMetaphone(row[0], row[1]);
    }

    // Each row is { expected alternate code, input }.
    final String[][] alternateCases = {
        { "TSTN", "testing" },
        { "T", "The" },
        { "KK", "quick" },
        { "PRN", "brown" },
        { "FKS", "fox" },
        { "AMPT", "jumped" },
        { "AFR", "over" },
        { "T", "the" },
        { "LS", "lazy" },
        { "TKS", "dogs" },
        { "MKFR", "MacCafferey" },
        { "STFN", "Stephan" },
        { "KXFS", "Kutchefski" },
        { "MKLL", "McClelland" },
        { "SNHS", "san jose" },
        { "SNFP", "xenophobia" },
        { "FKR", "Fokker" },
        { "AK", "Joqqi" },
        { "HF", "Hovvi" },
        { "XRN", "Czerny" },
    };
    for (final String[] row : alternateCases) {
        assertDoubleMetaphoneAlt(row[0], row[1]);
    }
}
/**
 * Checks that {@code null}, the empty string, and whitespace-only strings all
 * encode to {@code null}.
 */
@Test
public void testEmpty() {
    final String[] blankInputs = { null, "", " ", "\t\n\r " };
    for (final String input : blankInputs) {
        assertEquals(null, getStringEncoder().doubleMetaphone(input));
    }
}
/**
 * Runs {@code doubleMetaphoneEqualTest} over a small set of pairs that are expected
 * to match, first with the alternate flag off and then with it on.
 */
@Test
public void testIsDoubleMetaphoneEqualBasic() {
    final String[][] equalPairs = {
        { "", "" },
        { "Case", "case" },
        { "CASE", "Case" },
        { "caSe", "cAsE" },
        { "cookie", "quick" },
        { "quick", "cookie" },
        { "Brian", "Bryan" },
        { "Auto", "Otto" },
        { "Steven", "Stefan" },
        { "Philipowitz", "Filipowicz" },
    };
    for (final boolean useAlternate : new boolean[] { false, true }) {
        doubleMetaphoneEqualTest(equalPairs, useAlternate);
    }
}
/**
* Placeholder for the "Smith" / "Schmidt" pair: an example from the original
* article that fails to match in this Java implementation.
*
* <p>The checks below are commented out, so this test currently asserts nothing
* and always passes.</p>
*/
@Test
public void testIsDoubleMetaphoneEqualExtended1() {
// Disabled because the pair does not match with this implementation:
// String[][] testFixture = new String[][] { { "Smith", "Schmidt" }
// };
// doubleMetaphoneEqualTest(testFixture, false);
// doubleMetaphoneEqualTest(testFixture, true);
}
/**
 * Checks that "Jablonski" and "Yablonsky" match when the alternate flag is on.
 * The check with the alternate flag off is left disabled.
 */
@Test
public void testIsDoubleMetaphoneEqualExtended2() {
    final String[][] jablonskiPair = { { "Jablonski", "Yablonsky" } };
    // Disabled in the original test.
    // NOTE(review): presumably the pair does not match with the flag off - confirm.
    // doubleMetaphoneEqualTest(jablonskiPair, false);
    doubleMetaphoneEqualTest(jablonskiPair, true);
}
/**
 * Used to generate the MATCHES array and test possible matches from the
 * FIXTURE array.
 *
 * <p>Every FIXTURE pair is compared with the alternate flag off and on. Pairs that
 * match in at least one mode are appended to the generated MATCHES source text; the
 * others are collected as failures. Both reports are disabled (commented out), so
 * beyond fixture validation this test asserts nothing.</p>
 */
@Test
public void testIsDoubleMetaphoneEqualExtended3() {
    this.validateFixture(FIXTURE);
    final String eol = System.lineSeparator();
    final StringBuilder unmatched = new StringBuilder();
    final StringBuilder generated = new StringBuilder();
    generated.append("private static final String[][] MATCHES = {").append(eol);

    int unmatchedCount = 0;
    int index = 0;
    for (final String[] pair : FIXTURE) {
        final String left = pair[0];
        final String right = pair[1];
        final boolean plainMatch = getStringEncoder().isDoubleMetaphoneEqual(left, right, false);
        final boolean alternateMatch = getStringEncoder().isDoubleMetaphoneEqual(left, right, true);
        if (plainMatch || alternateMatch) {
            // Emit the pair as one row of the generated array literal.
            generated.append("{\"").append(left).append("\", \"").append(right).append("\"},").append(eol);
        } else {
            unmatched.append("[").append(index).append("] ").append(left).append(" and ").append(right).append(eol);
            unmatchedCount++;
        }
        index++;
    }
    generated.append("};");

    // Turn on to print a new MATCH array
    // System.out.println(generated.toString());
    if (unmatchedCount > 0) {
        // Turn on to see which pairs do NOT match.
        // String msg = unmatched.toString();
        // fail(unmatchedCount + " failures out of " + FIXTURE.length + ". The
        // following could be made to match: " + eol + msg);
    }
}
/**
 * Checks that every pair in {@code MATCHES} is Double Metaphone equal with the
 * alternate flag off, on, or both. Fails on the first pair that matches in
 * neither mode, reporting its index.
 */
@Test
public void testIsDoubleMetaphoneEqualWithMATCHES() {
    this.validateFixture(MATCHES);
    int index = 0;
    for (final String[] pair : MATCHES) {
        final String left = pair[0];
        final String right = pair[1];
        final boolean plainMatch = getStringEncoder().isDoubleMetaphoneEqual(left, right, false);
        final boolean alternateMatch = getStringEncoder().isDoubleMetaphoneEqual(left, right, true);
        if (!(plainMatch || alternateMatch)) {
            fail("Expected match [" + index + "] " + left + " and " + right);
        }
        index++;
    }
}
/**
 * Runs {@code doubleMetaphoneNotEqualTest} with the alternate flag off, then on.
 */
@Test
public void testIsDoubleMetaphoneNotEqual() {
    for (final boolean alternate : new boolean[] { false, true }) {
        doubleMetaphoneNotEqualTest(alternate);
    }
}
/**
 * Checks that a lone n-tilde (U+00F1) is Double Metaphone equal to "N".
 */
@Test
public void testNTilde() {
    final String nTilde = "\u00f1"; // n-tilde
    final boolean equalToN = getStringEncoder().isDoubleMetaphoneEqual(nTilde, "N");
    assertTrue(equalToN);
}
/**
 * Tests setting the maximum code length: verifies the default of 4 and the codes
 * produced for "jumped", then lowers the limit to 3 and verifies the shorter codes.
 */
@Test
public void testSetMaxCodeLength() {
    final DoubleMetaphone encoder = new DoubleMetaphone();
    final String word = "jumped";

    // Sanity check of default settings
    final int defaultMax = encoder.getMaxCodeLen();
    assertEquals("Default Max Code Length", 4, defaultMax);
    final String defaultPrimary = encoder.doubleMetaphone(word, false);
    assertEquals("Default Primary", "JMPT", defaultPrimary);
    final String defaultAlternate = encoder.doubleMetaphone(word, true);
    assertEquals("Default Alternate", "AMPT", defaultAlternate);

    // Check setting Max Code Length
    encoder.setMaxCodeLen(3);
    final int reducedMax = encoder.getMaxCodeLen();
    assertEquals("Set Max Code Length", 3, reducedMax);
    final String shortPrimary = encoder.doubleMetaphone(word, false);
    assertEquals("Max=3 Primary", "JMP", shortPrimary);
    final String shortAlternate = encoder.doubleMetaphone(word, true);
    assertEquals("Max=3 Alternate", "AMP", shortAlternate);
}
/**
 * Sanity-checks a test fixture: it must contain at least one row, and every row
 * must hold exactly two entries. Fails the test on the first violation.
 *
 * @param pairs the fixture to validate
 */
public void validateFixture(final String[][] pairs) {
    final int rowCount = pairs.length;
    if (rowCount == 0) {
        fail("Test fixture is empty");
    }
    int index = 0;
    for (final String[] row : pairs) {
        if (row.length != 2) {
            fail("Error in test fixture in the data array at index " + index);
        }
        index++;
    }
}
}