| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.commons.codec.language; |
| |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Scanner; |
| import java.util.Set; |
| |
| import org.apache.commons.codec.CharEncoding; |
| import org.apache.commons.codec.EncoderException; |
| import org.apache.commons.codec.Resources; |
| import org.apache.commons.codec.StringEncoder; |
| |
| /** |
| * Encodes a string into a Daitch-Mokotoff Soundex value. |
| * <p> |
| * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater |
| * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. |
| * </p> |
| * <p> |
| * The main differences compared to the other soundex variants are: |
| * </p> |
| * <ul> |
| * <li>coded names are 6 digits long |
| * <li>the initial character of the name is coded |
| * <li>rules to encoded multi-character n-grams |
| * <li>multiple possible encodings for the same name (branching) |
| * </ul> |
| * <p> |
| * This implementation supports branching, depending on the used method: |
| * <ul> |
| * <li>{@link #encode(String)} - branching disabled, only the first code will be returned |
| * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' |
| * </ul> |
| * <p> |
| * Note: this implementation has additional branching rules compared to the original description of the algorithm. The |
| * rules can be customized by overriding the default rules contained in the resource file |
| * {@code org/apache/commons/codec/language/dmrules.txt}. |
| * </p> |
| * <p> |
| * This class is thread-safe. |
| * </p> |
| * |
| * @see Soundex |
| * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> |
| * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> |
| * |
| * @since 1.10 |
| */ |
| public class DaitchMokotoffSoundex implements StringEncoder { |
| |
| /** |
| * Inner class representing a branch during DM soundex encoding. |
| */ |
| private static final class Branch { |
| private final StringBuilder builder; |
| private String cachedString; |
| private String lastReplacement; |
| |
| private Branch() { |
| builder = new StringBuilder(); |
| lastReplacement = null; |
| cachedString = null; |
| } |
| |
| /** |
| * Creates a new branch, identical to this branch. |
| * |
| * @return a new, identical branch |
| */ |
| public Branch createBranch() { |
| final Branch branch = new Branch(); |
| branch.builder.append(toString()); |
| branch.lastReplacement = this.lastReplacement; |
| return branch; |
| } |
| |
| @Override |
| public boolean equals(final Object other) { |
| if (this == other) { |
| return true; |
| } |
| if (!(other instanceof Branch)) { |
| return false; |
| } |
| |
| return toString().equals(((Branch) other).toString()); |
| } |
| |
| /** |
| * Finish this branch by appending '0's until the maximum code length has been reached. |
| */ |
| public void finish() { |
| while (builder.length() < MAX_LENGTH) { |
| builder.append('0'); |
| cachedString = null; |
| } |
| } |
| |
| @Override |
| public int hashCode() { |
| return toString().hashCode(); |
| } |
| |
| /** |
| * Process the next replacement to be added to this branch. |
| * |
| * @param replacement |
| * the next replacement to append |
| * @param forceAppend |
| * indicates if the default processing shall be overridden |
| */ |
| public void processNextReplacement(final String replacement, final boolean forceAppend) { |
| final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend; |
| |
| if (append && builder.length() < MAX_LENGTH) { |
| builder.append(replacement); |
| // remove all characters after the maximum length |
| if (builder.length() > MAX_LENGTH) { |
| builder.delete(MAX_LENGTH, builder.length()); |
| } |
| cachedString = null; |
| } |
| |
| lastReplacement = replacement; |
| } |
| |
| @Override |
| public String toString() { |
| if (cachedString == null) { |
| cachedString = builder.toString(); |
| } |
| return cachedString; |
| } |
| } |
| |
| /** |
| * Inner class for storing rules. |
| */ |
| private static final class Rule { |
| private final String pattern; |
| private final String[] replacementAtStart; |
| private final String[] replacementBeforeVowel; |
| private final String[] replacementDefault; |
| |
| protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, |
| final String replacementDefault) { |
| this.pattern = pattern; |
| this.replacementAtStart = replacementAtStart.split("\\|"); |
| this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); |
| this.replacementDefault = replacementDefault.split("\\|"); |
| } |
| |
| public int getPatternLength() { |
| return pattern.length(); |
| } |
| |
| public String[] getReplacements(final String context, final boolean atStart) { |
| if (atStart) { |
| return replacementAtStart; |
| } |
| |
| final int nextIndex = getPatternLength(); |
| final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false; |
| if (nextCharIsVowel) { |
| return replacementBeforeVowel; |
| } |
| |
| return replacementDefault; |
| } |
| |
| private boolean isVowel(final char ch) { |
| return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; |
| } |
| |
| public boolean matches(final String context) { |
| return context.startsWith(pattern); |
| } |
| |
| @Override |
| public String toString() { |
| return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), |
| Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); |
| } |
| } |
| |
| private static final String COMMENT = "//"; |
| private static final String DOUBLE_QUOTE = "\""; |
| |
| private static final String MULTILINE_COMMENT_END = "*/"; |
| |
| private static final String MULTILINE_COMMENT_START = "/*"; |
| |
| /** The resource file containing the replacement and folding rules */ |
| private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt"; |
| |
| /** The code length of a DM soundex value. */ |
| private static final int MAX_LENGTH = 6; |
| |
| /** Transformation rules indexed by the first character of their pattern. */ |
| private static final Map<Character, List<Rule>> RULES = new HashMap<>(); |
| |
| /** Folding rules. */ |
| private static final Map<Character, Character> FOLDINGS = new HashMap<>(); |
| |
| static { |
| try (final Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) { |
| parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); |
| } |
| |
| // sort RULES by pattern length in descending order |
| for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) { |
| final List<Rule> ruleList = rule.getValue(); |
| Collections.sort(ruleList, new Comparator<Rule>() { |
| @Override |
| public int compare(final Rule rule1, final Rule rule2) { |
| return rule2.getPatternLength() - rule1.getPatternLength(); |
| } |
| }); |
| } |
| } |
| |
| private static void parseRules(final Scanner scanner, final String location, |
| final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) { |
| int currentLine = 0; |
| boolean inMultilineComment = false; |
| |
| while (scanner.hasNextLine()) { |
| currentLine++; |
| final String rawLine = scanner.nextLine(); |
| String line = rawLine; |
| |
| if (inMultilineComment) { |
| if (line.endsWith(MULTILINE_COMMENT_END)) { |
| inMultilineComment = false; |
| } |
| continue; |
| } |
| |
| if (line.startsWith(MULTILINE_COMMENT_START)) { |
| inMultilineComment = true; |
| } else { |
| // discard comments |
| final int cmtI = line.indexOf(COMMENT); |
| if (cmtI >= 0) { |
| line = line.substring(0, cmtI); |
| } |
| |
| // trim leading-trailing whitespace |
| line = line.trim(); |
| |
| if (line.isEmpty()) { |
| continue; // empty lines can be safely skipped |
| } |
| |
| if (line.contains("=")) { |
| // folding |
| final String[] parts = line.split("="); |
| if (parts.length != 2) { |
| throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + |
| " parts: " + rawLine + " in " + location); |
| } |
| final String leftCharacter = parts[0]; |
| final String rightCharacter = parts[1]; |
| |
| if (leftCharacter.length() != 1 || rightCharacter.length() != 1) { |
| throw new IllegalArgumentException("Malformed folding statement - " + |
| "patterns are not single characters: " + rawLine + " in " + location); |
| } |
| |
| asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); |
| } else { |
| // rule |
| final String[] parts = line.split("\\s+"); |
| if (parts.length != 4) { |
| throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + |
| " parts: " + rawLine + " in " + location); |
| } |
| try { |
| final String pattern = stripQuotes(parts[0]); |
| final String replacement1 = stripQuotes(parts[1]); |
| final String replacement2 = stripQuotes(parts[2]); |
| final String replacement3 = stripQuotes(parts[3]); |
| |
| final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); |
| final char patternKey = r.pattern.charAt(0); |
| List<Rule> rules = ruleMapping.get(patternKey); |
| if (rules == null) { |
| rules = new ArrayList<>(); |
| ruleMapping.put(patternKey, rules); |
| } |
| rules.add(r); |
| } catch (final IllegalArgumentException e) { |
| throw new IllegalStateException( |
| "Problem parsing line '" + currentLine + "' in " + location, e); |
| } |
| } |
| } |
| } |
| } |
| |
| private static String stripQuotes(String str) { |
| if (str.startsWith(DOUBLE_QUOTE)) { |
| str = str.substring(1); |
| } |
| |
| if (str.endsWith(DOUBLE_QUOTE)) { |
| str = str.substring(0, str.length() - 1); |
| } |
| |
| return str; |
| } |
| |
| /** Whether to use ASCII folding prior to encoding. */ |
| private final boolean folding; |
| |
| /** |
| * Creates a new instance with ASCII-folding enabled. |
| */ |
| public DaitchMokotoffSoundex() { |
| this(true); |
| } |
| |
| /** |
| * Creates a new instance. |
| * <p> |
| * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. |
| * è -> e. |
| * </p> |
| * |
| * @param folding |
| * if ASCII-folding shall be performed before encoding |
| */ |
| public DaitchMokotoffSoundex(final boolean folding) { |
| this.folding = folding; |
| } |
| |
| /** |
| * Performs a cleanup of the input string before the actual soundex transformation. |
| * <p> |
| * Removes all whitespace characters and performs ASCII folding if enabled. |
| * </p> |
| * |
| * @param input |
| * the input string to cleanup |
| * @return a cleaned up string |
| */ |
| private String cleanup(final String input) { |
| final StringBuilder sb = new StringBuilder(); |
| for (char ch : input.toCharArray()) { |
| if (Character.isWhitespace(ch)) { |
| continue; |
| } |
| |
| ch = Character.toLowerCase(ch); |
| if (folding && FOLDINGS.containsKey(ch)) { |
| ch = FOLDINGS.get(ch); |
| } |
| sb.append(ch); |
| } |
| return sb.toString(); |
| } |
| |
| /** |
| * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. |
| * <p> |
| * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an |
| * EncoderException if the supplied object is not of type java.lang.String. |
| * </p> |
| * |
| * @see #soundex(String) |
| * |
| * @param obj |
| * Object to encode |
| * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String |
| * supplied. |
| * @throws EncoderException |
| * if the parameter supplied is not of type java.lang.String |
| * @throws IllegalArgumentException |
| * if a character is not mapped |
| */ |
| @Override |
| public Object encode(final Object obj) throws EncoderException { |
| if (!(obj instanceof String)) { |
| throw new EncoderException( |
| "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); |
| } |
| return encode((String) obj); |
| } |
| |
| /** |
| * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. |
| * |
| * @see #soundex(String) |
| * |
| * @param source |
| * A String object to encode |
| * @return A DM Soundex code corresponding to the String supplied |
| * @throws IllegalArgumentException |
| * if a character is not mapped |
| */ |
| @Override |
| public String encode(final String source) { |
| if (source == null) { |
| return null; |
| } |
| return soundex(source, false)[0]; |
| } |
| |
| /** |
| * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. |
| * <p> |
| * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, |
| * separated by '|'. |
| * </p> |
| * <p> |
| * Example: the name "AUERBACH" is encoded as both |
| * </p> |
| * <ul> |
| * <li>097400</li> |
| * <li>097500</li> |
| * </ul> |
| * <p> |
| * Thus the result will be "097400|097500". |
| * </p> |
| * |
| * @param source |
| * A String object to encode |
| * @return A string containing a set of DM Soundex codes corresponding to the String supplied |
| * @throws IllegalArgumentException |
| * if a character is not mapped |
| */ |
| public String soundex(final String source) { |
| final String[] branches = soundex(source, true); |
| final StringBuilder sb = new StringBuilder(); |
| int index = 0; |
| for (final String branch : branches) { |
| sb.append(branch); |
| if (++index < branches.length) { |
| sb.append('|'); |
| } |
| } |
| return sb.toString(); |
| } |
| |
| /** |
| * Perform the actual DM Soundex algorithm on the input string. |
| * |
| * @param source |
| * A String object to encode |
| * @param branching |
| * If branching shall be performed |
| * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the |
| * selected branching mode |
| */ |
| private String[] soundex(final String source, final boolean branching) { |
| if (source == null) { |
| return null; |
| } |
| |
| final String input = cleanup(source); |
| |
| final Set<Branch> currentBranches = new LinkedHashSet<>(); |
| currentBranches.add(new Branch()); |
| |
| char lastChar = '\0'; |
| for (int index = 0; index < input.length(); index++) { |
| final char ch = input.charAt(index); |
| |
| // ignore whitespace inside a name |
| if (Character.isWhitespace(ch)) { |
| continue; |
| } |
| |
| final String inputContext = input.substring(index); |
| final List<Rule> rules = RULES.get(ch); |
| if (rules == null) { |
| continue; |
| } |
| |
| // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access |
| final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.<Branch>emptyList(); |
| |
| for (final Rule rule : rules) { |
| if (rule.matches(inputContext)) { |
| if (branching) { |
| nextBranches.clear(); |
| } |
| final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); |
| final boolean branchingRequired = replacements.length > 1 && branching; |
| |
| for (final Branch branch : currentBranches) { |
| for (final String nextReplacement : replacements) { |
| // if we have multiple replacements, always create a new branch |
| final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; |
| |
| // special rule: occurrences of mn or nm are treated differently |
| final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm'); |
| |
| nextBranch.processNextReplacement(nextReplacement, force); |
| |
| if (branching) { |
| nextBranches.add(nextBranch); |
| } else { |
| break; |
| } |
| } |
| } |
| |
| if (branching) { |
| currentBranches.clear(); |
| currentBranches.addAll(nextBranches); |
| } |
| index += rule.getPatternLength() - 1; |
| break; |
| } |
| } |
| |
| lastChar = ch; |
| } |
| |
| final String[] result = new String[currentBranches.size()]; |
| int index = 0; |
| for (final Branch branch : currentBranches) { |
| branch.finish(); |
| result[index++] = branch.toString(); |
| } |
| |
| return result; |
| } |
| } |