src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java - commons-codec - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.commons.codec.language;

 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Scanner;
 import java.util.Set;

 import org.apache.commons.codec.CharEncoding;
 import org.apache.commons.codec.EncoderException;
 import org.apache.commons.codec.StringEncoder;

 /**
  * Encodes a string into a Daitch-Mokotoff Soundex value.
  * <p>
  * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
  * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
  * </p>
  * <p>
  * The main differences compared to the other soundex variants are:
  * </p>
  * <ul>
  * <li>coded names are 6 digits long
  * <li>the initial character of the name is coded
  * <li>rules to encoded multi-character n-grams
  * <li>multiple possible encodings for the same name (branching)
  * </ul>
  * <p>
  * This implementation supports branching, depending on the used method:
  * <ul>
  * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
  * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
  * </ul>
  * <p>
  * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
  * rules can be customized by overriding the default rules contained in the resource file
  * {@code org/apache/commons/codec/language/dmrules.txt}.
  * </p>
  * <p>
  * This class is thread-safe.
  * </p>
  *
  * @see Soundex
  * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
  * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
  *
  * @version $Id$
  * @since 1.10
  */
 public class DaitchMokotoffSoundex implements StringEncoder {

     /**
      * Inner class representing a branch during DM soundex encoding.
      */
     private static final class Branch {
         private final StringBuilder builder;
         private String cachedString;
         private String lastReplacement;

         private Branch() {
             builder = new StringBuilder();
             lastReplacement = null;
             cachedString = null;
         }

         /**
          * Creates a new branch, identical to this branch.
          *
          * @return a new, identical branch
          */
         public Branch createBranch() {
             final Branch branch = new Branch();
             branch.builder.append(toString());
             branch.lastReplacement = this.lastReplacement;
             return branch;
         }

         @Override
         public boolean equals(final Object other) {
             if (this == other) {
                 return true;
             }
             if (!(other instanceof Branch)) {
                 return false;
             }

             return toString().equals(((Branch) other).toString());
         }

         /**
          * Finish this branch by appending '0's until the maximum code length has been reached.
          */
         public void finish() {
             while (builder.length() < MAX_LENGTH) {
                 builder.append('0');
                 cachedString = null;
             }
         }

         @Override
         public int hashCode() {
             return toString().hashCode();
         }

         /**
          * Process the next replacement to be added to this branch.
          *
          * @param replacement
          *            the next replacement to append
          * @param forceAppend
          *            indicates if the default processing shall be overridden
          */
         public void processNextReplacement(final String replacement, final boolean forceAppend) {
             final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;

             if (append && builder.length() < MAX_LENGTH) {
                 builder.append(replacement);
                 // remove all characters after the maximum length
                 if (builder.length() > MAX_LENGTH) {
                     builder.delete(MAX_LENGTH, builder.length());
                 }
                 cachedString = null;
             }

             lastReplacement = replacement;
         }

         @Override
         public String toString() {
             if (cachedString == null) {
                 cachedString = builder.toString();
             }
             return cachedString;
         }
     }

     /**
      * Inner class for storing rules.
      */
     private static final class Rule {
         private final String pattern;
         private final String[] replacementAtStart;
         private final String[] replacementBeforeVowel;
         private final String[] replacementDefault;

         protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
                 final String replacementDefault) {
             this.pattern = pattern;
             this.replacementAtStart = replacementAtStart.split("\\|");
             this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
             this.replacementDefault = replacementDefault.split("\\|");
         }

         public int getPatternLength() {
             return pattern.length();
         }

         public String[] getReplacements(final String context, final boolean atStart) {
             if (atStart) {
                 return replacementAtStart;
             }

             final int nextIndex = getPatternLength();
             final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
             if (nextCharIsVowel) {
                 return replacementBeforeVowel;
             }

             return replacementDefault;
         }

         private boolean isVowel(final char ch) {
             return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
         }

         public boolean matches(final String context) {
             return context.startsWith(pattern);
         }

         @Override
         public String toString() {
             return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
                     Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
         }
     }

     private static final String COMMENT = "//";
     private static final String DOUBLE_QUOTE = "\"";

     private static final String MULTILINE_COMMENT_END = "*/";

     private static final String MULTILINE_COMMENT_START = "/*";

     /** The resource file containing the replacement and folding rules */
     private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";

     /** The code length of a DM soundex value. */
     private static final int MAX_LENGTH = 6;

     /** Transformation rules indexed by the first character of their pattern. */
     private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();

     /** Folding rules. */
     private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();

     static {
         final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
         if (rulesIS == null) {
             throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
         }

         final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
         parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
         scanner.close();

         // sort RULES by pattern length in descending order
         for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
             final List<Rule> ruleList = rule.getValue();
             Collections.sort(ruleList, new Comparator<Rule>() {
                 @Override
                 public int compare(final Rule rule1, final Rule rule2) {
                     return rule2.getPatternLength() - rule1.getPatternLength();
                 }
             });
         }
     }

     private static void parseRules(final Scanner scanner, final String location,
             final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
         int currentLine = 0;
         boolean inMultilineComment = false;

         while (scanner.hasNextLine()) {
             currentLine++;
             final String rawLine = scanner.nextLine();
             String line = rawLine;

             if (inMultilineComment) {
                 if (line.endsWith(MULTILINE_COMMENT_END)) {
                     inMultilineComment = false;
                 }
                 continue;
             }

             if (line.startsWith(MULTILINE_COMMENT_START)) {
                 inMultilineComment = true;
             } else {
                 // discard comments
                 final int cmtI = line.indexOf(COMMENT);
                 if (cmtI >= 0) {
                     line = line.substring(0, cmtI);
                 }

                 // trim leading-trailing whitespace
                 line = line.trim();

                 if (line.length() == 0) {
                     continue; // empty lines can be safely skipped
                 }

                 if (line.contains("=")) {
                     // folding
                     final String[] parts = line.split("=");
                     if (parts.length != 2) {
                         throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
                                 " parts: " + rawLine + " in " + location);
                     } else {
                         final String leftCharacter = parts[0];
                         final String rightCharacter = parts[1];

                         if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
                             throw new IllegalArgumentException("Malformed folding statement - " +
                                     "patterns are not single characters: " + rawLine + " in " + location);
                         }

                         asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
                     }
                 } else {
                     // rule
                     final String[] parts = line.split("\\s+");
                     if (parts.length != 4) {
                         throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
                                 " parts: " + rawLine + " in " + location);
                     } else {
                         try {
                             final String pattern = stripQuotes(parts[0]);
                             final String replacement1 = stripQuotes(parts[1]);
                             final String replacement2 = stripQuotes(parts[2]);
                             final String replacement3 = stripQuotes(parts[3]);

                             final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
                             final char patternKey = r.pattern.charAt(0);
                             List<Rule> rules = ruleMapping.get(patternKey);
                             if (rules == null) {
                                 rules = new ArrayList<Rule>();
                                 ruleMapping.put(patternKey, rules);
                             }
                             rules.add(r);
                         } catch (final IllegalArgumentException e) {
                             throw new IllegalStateException(
                                     "Problem parsing line '" + currentLine + "' in " + location, e);
                         }
                     }
                 }
             }
         }
     }

     private static String stripQuotes(String str) {
         if (str.startsWith(DOUBLE_QUOTE)) {
             str = str.substring(1);
         }

         if (str.endsWith(DOUBLE_QUOTE)) {
             str = str.substring(0, str.length() - 1);
         }

         return str;
     }

     /** Whether to use ASCII folding prior to encoding. */
     private final boolean folding;

     /**
      * Creates a new instance with ASCII-folding enabled.
      */
     public DaitchMokotoffSoundex() {
         this(true);
     }

     /**
      * Creates a new instance.
      * <p>
      * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
      * è -&gt; e.
      * </p>
      *
      * @param folding
      *            if ASCII-folding shall be performed before encoding
      */
     public DaitchMokotoffSoundex(final boolean folding) {
         this.folding = folding;
     }

     /**
      * Performs a cleanup of the input string before the actual soundex transformation.
      * <p>
      * Removes all whitespace characters and performs ASCII folding if enabled.
      * </p>
      *
      * @param input
      *            the input string to cleanup
      * @return a cleaned up string
      */
     private String cleanup(final String input) {
         final StringBuilder sb = new StringBuilder();
         for (char ch : input.toCharArray()) {
             if (Character.isWhitespace(ch)) {
                 continue;
             }

             ch = Character.toLowerCase(ch);
             if (folding && FOLDINGS.containsKey(ch)) {
                 ch = FOLDINGS.get(ch);
             }
             sb.append(ch);
         }
         return sb.toString();
     }

     /**
      * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
      * <p>
      * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
      * EncoderException if the supplied object is not of type java.lang.String.
      * </p>
      *
      * @see #soundex(String)
      *
      * @param obj
      *            Object to encode
      * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
      *         supplied.
      * @throws EncoderException
      *             if the parameter supplied is not of type java.lang.String
      * @throws IllegalArgumentException
      *             if a character is not mapped
      */
     @Override
     public Object encode(final Object obj) throws EncoderException {
         if (!(obj instanceof String)) {
             throw new EncoderException(
                     "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
         }
         return encode((String) obj);
     }

     /**
      * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
      *
      * @see #soundex(String)
      *
      * @param source
      *            A String object to encode
      * @return A DM Soundex code corresponding to the String supplied
      * @throws IllegalArgumentException
      *             if a character is not mapped
      */
     @Override
     public String encode(final String source) {
         if (source == null) {
             return null;
         }
         return soundex(source, false)[0];
     }

     /**
      * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
      * <p>
      * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
      * separated by '|'.
      * </p>
      * <p>
      * Example: the name "AUERBACH" is encoded as both
      * </p>
      * <ul>
      * <li>097400</li>
      * <li>097500</li>
      * </ul>
      * <p>
      * Thus the result will be "097400|097500".
      * </p>
      *
      * @param source
      *            A String object to encode
      * @return A string containing a set of DM Soundex codes corresponding to the String supplied
      * @throws IllegalArgumentException
      *             if a character is not mapped
      */
     public String soundex(final String source) {
         final String[] branches = soundex(source, true);
         final StringBuilder sb = new StringBuilder();
         int index = 0;
         for (final String branch : branches) {
             sb.append(branch);
             if (++index < branches.length) {
                 sb.append('|');
             }
         }
         return sb.toString();
     }

     /**
      * Perform the actual DM Soundex algorithm on the input string.
      *
      * @param source
      *            A String object to encode
      * @param branching
      *            If branching shall be performed
      * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
      *         selected branching mode
      */
     private String[] soundex(final String source, final boolean branching) {
         if (source == null) {
             return null;
         }

         final String input = cleanup(source);

         final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
         currentBranches.add(new Branch());

         char lastChar = '\0';
         for (int index = 0; index < input.length(); index++) {
             final char ch = input.charAt(index);

             // ignore whitespace inside a name
             if (Character.isWhitespace(ch)) {
                 continue;
             }

             final String inputContext = input.substring(index);
             final List<Rule> rules = RULES.get(ch);
             if (rules == null) {
                 continue;
             }

             // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
             @SuppressWarnings("unchecked")
             final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;

             for (final Rule rule : rules) {
                 if (rule.matches(inputContext)) {
                     if (branching) {
                         nextBranches.clear();
                     }
                     final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
                     final boolean branchingRequired = replacements.length > 1 && branching;

                     for (final Branch branch : currentBranches) {
                         for (final String nextReplacement : replacements) {
                             // if we have multiple replacements, always create a new branch
                             final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;

                             // special rule: occurrences of mn or nm are treated differently
                             final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');

                             nextBranch.processNextReplacement(nextReplacement, force);

                             if (branching) {
                                 nextBranches.add(nextBranch);
                             } else {
                                 break;
                             }
                         }
                     }

                     if (branching) {
                         currentBranches.clear();
                         currentBranches.addAll(nextBranches);
                     }
                     index += rule.getPatternLength() - 1;
                     break;
                 }
             }

             lastChar = ch;
         }

         final String[] result = new String[currentBranches.size()];
         int index = 0;
         for (final Branch branch : currentBranches) {
             branch.finish();
             result[index++] = branch.toString();
         }

         return result;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.commons.codec.language;

	import java.io.InputStream;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.HashMap;
	import java.util.LinkedHashSet;
	import java.util.List;
	import java.util.Map;
	import java.util.Scanner;
	import java.util.Set;

	import org.apache.commons.codec.CharEncoding;
	import org.apache.commons.codec.EncoderException;
	import org.apache.commons.codec.StringEncoder;

	/**
	* Encodes a string into a Daitch-Mokotoff Soundex value.
	* <p>
	* The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
	* accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
	* </p>
	* <p>
	* The main differences compared to the other soundex variants are:
	* </p>
	* <ul>
	* <li>coded names are 6 digits long
	* <li>the initial character of the name is coded
	* <li>rules to encoded multi-character n-grams
	* <li>multiple possible encodings for the same name (branching)
	* </ul>
	* <p>
	* This implementation supports branching, depending on the used method:
	* <ul>
	* <li>{@link #encode(String)} - branching disabled, only the first code will be returned
	* <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '\|'
	* </ul>
	* <p>
	* Note: this implementation has additional branching rules compared to the original description of the algorithm. The
	* rules can be customized by overriding the default rules contained in the resource file
	* {@code org/apache/commons/codec/language/dmrules.txt}.
	* </p>
	* <p>
	* This class is thread-safe.
	* </p>
	*
	* @see Soundex
	* @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
	* @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
	*
	* @version $Id$
	* @since 1.10
	*/
	public class DaitchMokotoffSoundex implements StringEncoder {

	/**
	* Inner class representing a branch during DM soundex encoding.
	*/
	private static final class Branch {
	private final StringBuilder builder;
	private String cachedString;
	private String lastReplacement;

	private Branch() {
	builder = new StringBuilder();
	lastReplacement = null;
	cachedString = null;
	}

	/**
	* Creates a new branch, identical to this branch.
	*
	* @return a new, identical branch
	*/
	public Branch createBranch() {
	final Branch branch = new Branch();
	branch.builder.append(toString());
	branch.lastReplacement = this.lastReplacement;
	return branch;
	}

	@Override
	public boolean equals(final Object other) {
	if (this == other) {
	return true;
	}
	if (!(other instanceof Branch)) {
	return false;
	}

	return toString().equals(((Branch) other).toString());
	}

	/**
	* Finish this branch by appending '0's until the maximum code length has been reached.
	*/
	public void finish() {
	while (builder.length() < MAX_LENGTH) {
	builder.append('0');
	cachedString = null;
	}
	}

	@Override
	public int hashCode() {
	return toString().hashCode();
	}

	/**
	* Process the next replacement to be added to this branch.
	*
	* @param replacement
	* the next replacement to append
	* @param forceAppend
	* indicates if the default processing shall be overridden
	*/
	public void processNextReplacement(final String replacement, final boolean forceAppend) {
	final boolean append = lastReplacement == null \|\| !lastReplacement.endsWith(replacement) \|\| forceAppend;

	if (append && builder.length() < MAX_LENGTH) {
	builder.append(replacement);
	// remove all characters after the maximum length
	if (builder.length() > MAX_LENGTH) {
	builder.delete(MAX_LENGTH, builder.length());
	}
	cachedString = null;
	}

	lastReplacement = replacement;
	}

	@Override
	public String toString() {
	if (cachedString == null) {
	cachedString = builder.toString();
	}
	return cachedString;
	}
	}

	/**
	* Inner class for storing rules.
	*/
	private static final class Rule {
	private final String pattern;
	private final String[] replacementAtStart;
	private final String[] replacementBeforeVowel;
	private final String[] replacementDefault;

	protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
	final String replacementDefault) {
	this.pattern = pattern;
	this.replacementAtStart = replacementAtStart.split("\\\|");
	this.replacementBeforeVowel = replacementBeforeVowel.split("\\\|");
	this.replacementDefault = replacementDefault.split("\\\|");
	}

	public int getPatternLength() {
	return pattern.length();
	}

	public String[] getReplacements(final String context, final boolean atStart) {
	if (atStart) {
	return replacementAtStart;
	}

	final int nextIndex = getPatternLength();
	final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
	if (nextCharIsVowel) {
	return replacementBeforeVowel;
	}

	return replacementDefault;
	}

	private boolean isVowel(final char ch) {
	return ch == 'a' \|\| ch == 'e' \|\| ch == 'i' \|\| ch == 'o' \|\| ch == 'u';
	}

	public boolean matches(final String context) {
	return context.startsWith(pattern);
	}

	@Override
	public String toString() {
	return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
	Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
	}
	}

	private static final String COMMENT = "//";
	private static final String DOUBLE_QUOTE = "\"";

	private static final String MULTILINE_COMMENT_END = "*/";

	private static final String MULTILINE_COMMENT_START = "/*";

	/** The resource file containing the replacement and folding rules */
	private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";

	/** The code length of a DM soundex value. */
	private static final int MAX_LENGTH = 6;

	/** Transformation rules indexed by the first character of their pattern. */
	private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();

	/** Folding rules. */
	private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();

	static {
	final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
	if (rulesIS == null) {
	throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
	}

	final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
	parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
	scanner.close();

	// sort RULES by pattern length in descending order
	for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
	final List<Rule> ruleList = rule.getValue();
	Collections.sort(ruleList, new Comparator<Rule>() {
	@Override
	public int compare(final Rule rule1, final Rule rule2) {
	return rule2.getPatternLength() - rule1.getPatternLength();
	}
	});
	}
	}

	private static void parseRules(final Scanner scanner, final String location,
	final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
	int currentLine = 0;
	boolean inMultilineComment = false;

	while (scanner.hasNextLine()) {
	currentLine++;
	final String rawLine = scanner.nextLine();
	String line = rawLine;

	if (inMultilineComment) {
	if (line.endsWith(MULTILINE_COMMENT_END)) {
	inMultilineComment = false;
	}
	continue;
	}

	if (line.startsWith(MULTILINE_COMMENT_START)) {
	inMultilineComment = true;
	} else {
	// discard comments
	final int cmtI = line.indexOf(COMMENT);
	if (cmtI >= 0) {
	line = line.substring(0, cmtI);
	}

	// trim leading-trailing whitespace
	line = line.trim();

	if (line.length() == 0) {
	continue; // empty lines can be safely skipped
	}

	if (line.contains("=")) {
	// folding
	final String[] parts = line.split("=");
	if (parts.length != 2) {
	throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
	" parts: " + rawLine + " in " + location);
	} else {
	final String leftCharacter = parts[0];
	final String rightCharacter = parts[1];

	if (leftCharacter.length() != 1 \|\| rightCharacter.length() != 1) {
	throw new IllegalArgumentException("Malformed folding statement - " +
	"patterns are not single characters: " + rawLine + " in " + location);
	}

	asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
	}
	} else {
	// rule
	final String[] parts = line.split("\\s+");
	if (parts.length != 4) {
	throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
	" parts: " + rawLine + " in " + location);
	} else {
	try {
	final String pattern = stripQuotes(parts[0]);
	final String replacement1 = stripQuotes(parts[1]);
	final String replacement2 = stripQuotes(parts[2]);
	final String replacement3 = stripQuotes(parts[3]);

	final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
	final char patternKey = r.pattern.charAt(0);
	List<Rule> rules = ruleMapping.get(patternKey);
	if (rules == null) {
	rules = new ArrayList<Rule>();
	ruleMapping.put(patternKey, rules);
	}
	rules.add(r);
	} catch (final IllegalArgumentException e) {
	throw new IllegalStateException(
	"Problem parsing line '" + currentLine + "' in " + location, e);
	}
	}
	}
	}
	}
	}

	private static String stripQuotes(String str) {
	if (str.startsWith(DOUBLE_QUOTE)) {
	str = str.substring(1);
	}

	if (str.endsWith(DOUBLE_QUOTE)) {
	str = str.substring(0, str.length() - 1);
	}

	return str;
	}

	/** Whether to use ASCII folding prior to encoding. */
	private final boolean folding;

	/**
	* Creates a new instance with ASCII-folding enabled.
	*/
	public DaitchMokotoffSoundex() {
	this(true);
	}

	/**
	* Creates a new instance.
	* <p>
	* With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
	* è -> e.
	* </p>
	*
	* @param folding
	* if ASCII-folding shall be performed before encoding
	*/
	public DaitchMokotoffSoundex(final boolean folding) {
	this.folding = folding;
	}

	/**
	* Performs a cleanup of the input string before the actual soundex transformation.
	* <p>
	* Removes all whitespace characters and performs ASCII folding if enabled.
	* </p>
	*
	* @param input
	* the input string to cleanup
	* @return a cleaned up string
	*/
	private String cleanup(final String input) {
	final StringBuilder sb = new StringBuilder();
	for (char ch : input.toCharArray()) {
	if (Character.isWhitespace(ch)) {
	continue;
	}

	ch = Character.toLowerCase(ch);
	if (folding && FOLDINGS.containsKey(ch)) {
	ch = FOLDINGS.get(ch);
	}
	sb.append(ch);
	}
	return sb.toString();
	}

	/**
	* Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
	* <p>
	* This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
	* EncoderException if the supplied object is not of type java.lang.String.
	* </p>
	*
	* @see #soundex(String)
	*
	* @param obj
	* Object to encode
	* @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
	* supplied.
	* @throws EncoderException
	* if the parameter supplied is not of type java.lang.String
	* @throws IllegalArgumentException
	* if a character is not mapped
	*/
	@Override
	public Object encode(final Object obj) throws EncoderException {
	if (!(obj instanceof String)) {
	throw new EncoderException(
	"Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
	}
	return encode((String) obj);
	}

	/**
	* Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
	*
	* @see #soundex(String)
	*
	* @param source
	* A String object to encode
	* @return A DM Soundex code corresponding to the String supplied
	* @throws IllegalArgumentException
	* if a character is not mapped
	*/
	@Override
	public String encode(final String source) {
	if (source == null) {
	return null;
	}
	return soundex(source, false)[0];
	}

	/**
	* Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
	* <p>
	* In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
	* separated by '\|'.
	* </p>
	* <p>
	* Example: the name "AUERBACH" is encoded as both
	* </p>
	* <ul>
	* <li>097400</li>
	* <li>097500</li>
	* </ul>
	* <p>
	* Thus the result will be "097400\|097500".
	* </p>
	*
	* @param source
	* A String object to encode
	* @return A string containing a set of DM Soundex codes corresponding to the String supplied
	* @throws IllegalArgumentException
	* if a character is not mapped
	*/
	public String soundex(final String source) {
	final String[] branches = soundex(source, true);
	final StringBuilder sb = new StringBuilder();
	int index = 0;
	for (final String branch : branches) {
	sb.append(branch);
	if (++index < branches.length) {
	sb.append('\|');
	}
	}
	return sb.toString();
	}

	/**
	* Perform the actual DM Soundex algorithm on the input string.
	*
	* @param source
	* A String object to encode
	* @param branching
	* If branching shall be performed
	* @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
	* selected branching mode
	*/
	private String[] soundex(final String source, final boolean branching) {
	if (source == null) {
	return null;
	}

	final String input = cleanup(source);

	final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
	currentBranches.add(new Branch());

	char lastChar = '\0';
	for (int index = 0; index < input.length(); index++) {
	final char ch = input.charAt(index);

	// ignore whitespace inside a name
	if (Character.isWhitespace(ch)) {
	continue;
	}

	final String inputContext = input.substring(index);
	final List<Rule> rules = RULES.get(ch);
	if (rules == null) {
	continue;
	}

	// use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
	@SuppressWarnings("unchecked")
	final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;

	for (final Rule rule : rules) {
	if (rule.matches(inputContext)) {
	if (branching) {
	nextBranches.clear();
	}
	final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
	final boolean branchingRequired = replacements.length > 1 && branching;

	for (final Branch branch : currentBranches) {
	for (final String nextReplacement : replacements) {
	// if we have multiple replacements, always create a new branch
	final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;

	// special rule: occurrences of mn or nm are treated differently
	final boolean force = (lastChar == 'm' && ch == 'n') \|\| (lastChar == 'n' && ch == 'm');

	nextBranch.processNextReplacement(nextReplacement, force);

	if (branching) {
	nextBranches.add(nextBranch);
	} else {
	break;
	}
	}
	}

	if (branching) {
	currentBranches.clear();
	currentBranches.addAll(nextBranches);
	}
	index += rule.getPatternLength() - 1;
	break;
	}
	}

	lastChar = ch;
	}

	final String[] result = new String[currentBranches.size()];
	int index = 0;
	for (final Branch branch : currentBranches) {
	branch.finish();
	result[index++] = branch.toString();
	}

	return result;
	}
	}