| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.commons.codec.language.bm; |
| |
| import org.apache.commons.codec.EncoderException; |
| import org.apache.commons.codec.StringEncoder; |
| |
| /** |
| * Encodes strings into their Beider-Morse phonetic encoding. |
| * <p> |
| * Beider-Morse phonetic encodings are optimised for family names. However, they may be useful for a wide range of |
| * words. |
| * <p> |
| * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it may not |
| * be thread-safe. If you require a guaranteed thread-safe encoding then use {@link PhoneticEngine} directly. |
| * <p> |
| * <b>Encoding overview</b> |
| * <p> |
| * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what |
| * language the word comes from. For example, if it ends in "{@code ault}" then it infers that the word is French. |
| * Next, the word is translated into a phonetic representation using a language-specific phonetics table. Some runs of |
| * letters can be pronounced in multiple ways, and a single run of letters may be potentially broken up into phonemes at |
| * different places, so this stage results in a set of possible language-specific phonetic representations. Lastly, this |
| * language-specific phonetic representation is processed by a table of rules that re-writes it phonetically taking into |
| * account systematic pronunciation differences between languages, to move it towards a pan-indo-european phonetic |
| * representation. Again, sometimes there are multiple ways this could be done and sometimes things that can be |
| * pronounced in several ways in the source language have only one way to represent them in this average phonetic |
| * language, so the result is again a set of phonetic spellings. |
| * <p> |
| * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. In |
| * this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final encoding. |
| * Secondly, some names have standard prefixes, for example, "{@code Mac/Mc}" in Scottish (English) names. As |
| * sometimes it is ambiguous whether the prefix is intended or is an accident of the spelling, the word is encoded once |
| * with the prefix and once without it. The resulting encoding contains one and then the other result. |
| * <p> |
| * <b>Encoding format</b> |
| * <p> |
| * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where there |
| * are multiple possible phonetic representations, these are joined with a pipe ({@code |}) character. If multiple |
| * hyphenated words were found, or if the word may contain a name prefix, each encoded word is placed in parentheses |
| * and these blocks are then joined with hyphens. For example, "{@code d'ortley}" has a possible prefix. The form |
| * without prefix encodes to "{@code ortlaj|ortlej}", while the form with prefix encodes to |
| * "{@code dortlaj|dortlej}". Thus, the full, combined encoding is "{@code (ortlaj|ortlej)-(dortlaj|dortlej)}". |
| * <p> |
| * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many |
| * potential phonetic interpretations. For example, "{@code Renault}" encodes to |
| * "{@code rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult}". The {@code APPROX} rules will tend to produce larger |
| * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word. |
| * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by |
| * splitting on pipe ({@code |}) and indexing under each of these alternatives. |
| * <p> |
| * <b>Note</b>: this version of the Beider-Morse encoding is equivalent to v3.4 of the reference implementation. |
| * </p> |
| * @see <a href="http://stevemorse.org/phonetics/bmpm.htm">Beider-Morse Phonetic Matching</a> |
| * @see <a href="http://stevemorse.org/phoneticinfo.htm">Reference implementation</a> |
| * |
| * <p> |
| * This class is not thread-safe. |
| * </p> |
| * @since 1.6 |
| */ |
| public class BeiderMorseEncoder implements StringEncoder { |
| // Implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration |
| // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding. |
| |
| // a cached object |
| private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true); |
| |
| @Override |
| public Object encode(final Object source) throws EncoderException { |
| if (!(source instanceof String)) { |
| throw new EncoderException("BeiderMorseEncoder encode parameter is not of type String"); |
| } |
| return encode((String) source); |
| } |
| |
| @Override |
| public String encode(final String source) throws EncoderException { |
| if (source == null) { |
| return null; |
| } |
| return this.engine.encode(source); |
| } |
| |
| /** |
| * Gets the name type currently in operation. |
| * |
| * @return the NameType currently being used |
| */ |
| public NameType getNameType() { |
| return this.engine.getNameType(); |
| } |
| |
| /** |
| * Gets the rule type currently in operation. |
| * |
| * @return the RuleType currently being used |
| */ |
| public RuleType getRuleType() { |
| return this.engine.getRuleType(); |
| } |
| |
| /** |
| * Discovers if multiple possible encodings are concatenated. |
| * |
| * @return true if multiple encodings are concatenated, false if just the first one is returned |
| */ |
| public boolean isConcat() { |
| return this.engine.isConcat(); |
| } |
| |
| /** |
| * Sets how multiple possible phonetic encodings are combined. |
| * |
| * @param concat |
| * true if multiple encodings are to be combined with a '|', false if just the first one is |
| * to be considered |
| */ |
| public void setConcat(final boolean concat) { |
| this.engine = new PhoneticEngine(this.engine.getNameType(), |
| this.engine.getRuleType(), |
| concat, |
| this.engine.getMaxPhonemes()); |
| } |
| |
| /** |
| * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings |
| * optimized for Ashkenazi or Sephardic Jewish family names. |
| * |
| * @param nameType |
| * the NameType in use |
| */ |
| public void setNameType(final NameType nameType) { |
| this.engine = new PhoneticEngine(nameType, |
| this.engine.getRuleType(), |
| this.engine.isConcat(), |
| this.engine.getMaxPhonemes()); |
| } |
| |
| /** |
| * Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered. |
| * |
| * @param ruleType |
| * {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches |
| */ |
| public void setRuleType(final RuleType ruleType) { |
| this.engine = new PhoneticEngine(this.engine.getNameType(), |
| ruleType, |
| this.engine.isConcat(), |
| this.engine.getMaxPhonemes()); |
| } |
| |
| /** |
| * Sets the number of maximum of phonemes that shall be considered by the engine. |
| * |
| * @param maxPhonemes |
| * the maximum number of phonemes returned by the engine |
| * @since 1.7 |
| */ |
| public void setMaxPhonemes(final int maxPhonemes) { |
| this.engine = new PhoneticEngine(this.engine.getNameType(), |
| this.engine.getRuleType(), |
| this.engine.isConcat(), |
| maxPhonemes); |
| } |
| |
| } |