src/main/java/org/apache/commons/codec/language/bm/Rule.java - commons-codec - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.commons.codec.language.bm;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.EnumMap;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Scanner;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.commons.codec.Resources;
 import org.apache.commons.codec.language.bm.Languages.LanguageSet;

 /**
  * A phoneme rule.
  * <p>
  * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
  * and a logical flag indicating if all languages must be in play. A rule matches if:
  * <ul>
  * <li>the pattern matches at the current position</li>
  * <li>the string up until the beginning of the pattern matches the left context</li>
  * <li>the string from the end of the pattern matches the right context</li>
  * <li>logical is ALL and all languages are in scope; or</li>
  * <li>logical is any other value and at least one language is in scope</li>
  * </ul>
  * <p>
  * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
  * to explicitly construct their own.
  * <p>
  * Rules are immutable and thread-safe.
  * <p>
  * <b>Rules resources</b>
  * <p>
  * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
  * named following the pattern:
  * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
  * <p>
  * The format of these resources is the following:
  * <ul>
  * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
  * will be interpreted as:
  * <ol>
  * <li>pattern</li>
  * <li>left context</li>
  * <li>right context</li>
  * <li>phoneme</li>
  * </ol>
  * </li>
  * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
  * as a comment.</li>
  * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
  * all content until a line ending in '*' and '/' is found.</li>
  * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
  * </ul>
  *
  * @since 1.6
  */
 public class Rule {

     /**
      * A single phoneme: a run of phonetic text paired with the set of languages it applies to.
      * <p>
      * The text lives in a {@link StringBuilder} and grows in place through {@link #append(CharSequence)}.
      */
     public static final class Phoneme implements PhonemeExpr {
         /**
          * Orders phonemes by their text alone, one character at a time. When one text is a prefix of
          * the other, the shorter text sorts first. The language set takes no part in the ordering.
          */
         public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
             @Override
             public int compare(final Phoneme o1, final Phoneme o2) {
                 final CharSequence left = o1.phonemeText;
                 final CharSequence right = o2.phonemeText;
                 final int leftLength = left.length();
                 final int rightLength = right.length();
                 final int shared = Math.min(leftLength, rightLength);

                 for (int pos = 0; pos < shared; pos++) {
                     final int delta = left.charAt(pos) - right.charAt(pos);
                     if (delta != 0) {
                         return delta;
                     }
                 }

                 // identical across the shared prefix: the longer text is the greater one
                 if (leftLength == rightLength) {
                     return 0;
                 }
                 return leftLength < rightLength ? -1 : +1;
             }
         };

         private final StringBuilder phonemeText;
         private final Languages.LanguageSet languages;

         /**
          * Creates a phoneme from a copy of the given text and the given language set.
          *
          * @param phonemeText the initial text; its characters are copied
          * @param languages the languages this phoneme applies to
          */
         public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
             this.phonemeText = new StringBuilder(phonemeText);
             this.languages = languages;
         }

         /**
          * Creates a phoneme whose text is the left text followed by the right text, keeping the
          * language set of the left phoneme.
          *
          * @param phonemeLeft the phoneme supplying the leading text and the languages
          * @param phonemeRight the phoneme supplying the trailing text
          */
         public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
             this(phonemeLeft, phonemeRight, phonemeLeft.languages);
         }

         /**
          * Creates a phoneme whose text is the left text followed by the right text, with an
          * explicitly supplied language set.
          *
          * @param phonemeLeft the phoneme supplying the leading text
          * @param phonemeRight the phoneme supplying the trailing text
          * @param languages the languages of the new phoneme
          */
         public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
             this(phonemeLeft.phonemeText, languages);
             this.phonemeText.append(phonemeRight.phonemeText);
         }

         /**
          * Appends text to this phoneme in place.
          *
          * @param str the text to append
          * @return this phoneme
          */
         public Phoneme append(final CharSequence str) {
             phonemeText.append(str);
             return this;
         }

         /**
          * Gets the languages this phoneme applies to.
          *
          * @return the language set given at construction
          */
         public Languages.LanguageSet getLanguages() {
             return languages;
         }

         /**
          * Gets this phoneme as a one-element collection.
          *
          * @return a singleton containing this phoneme
          */
         @Override
         public Iterable<Phoneme> getPhonemes() {
             return Collections.singleton(this);
         }

         /**
          * Gets the phoneme text.
          *
          * @return the underlying text buffer itself, not a copy
          */
         public CharSequence getPhonemeText() {
             return phonemeText;
         }

         /**
          * Deprecated since 1.9.
          *
          * @param right the Phoneme to join
          * @return a new Phoneme
          * @deprecated since 1.9
          */
         @Deprecated
         public Phoneme join(final Phoneme right) {
             final String joinedText = this.phonemeText.toString() + right.phonemeText.toString();
             return new Phoneme(joinedText, this.languages.restrictTo(right.languages));
         }

         /**
          * Returns a new Phoneme with the same text but a union of its
          * current language set and the given one.
          *
          * @param lang the language set to merge
          * @return a new Phoneme
          */
         public Phoneme mergeWithLanguage(final LanguageSet lang) {
             final String sameText = this.phonemeText.toString();
             return new Phoneme(sameText, this.languages.merge(lang));
         }

         /**
          * Renders the phoneme as its text followed by the language set in square brackets.
          */
         @Override
         public String toString() {
             final StringBuilder rendered = new StringBuilder(phonemeText.toString());
             rendered.append("[").append(String.valueOf(languages)).append("]");
             return rendered.toString();
         }
     }

     /**
      * Something that can be expanded into phonemes: implemented in this file by a single
      * {@link Phoneme} and by {@link PhonemeList}.
      */
     public interface PhonemeExpr {
         /**
          * Gets the phonemes this expression stands for.
          *
          * @return the phonemes
          */
         Iterable<Phoneme> getPhonemes();
     }

     /**
      * A phoneme expression backed by a list of alternative phonemes.
      */
     public static final class PhonemeList implements PhonemeExpr {
         private final List<Phoneme> phonemes;

         /**
          * Wraps the given list. The list is stored as given, without copying.
          *
          * @param phonemes the phonemes of this expression
          */
         public PhonemeList(final List<Phoneme> phonemes) {
             this.phonemes = phonemes;
         }

         /**
          * Gets the phonemes.
          *
          * @return the same list instance that was passed to the constructor
          */
         @Override
         public List<Phoneme> getPhonemes() {
             return this.phonemes;
         }
     }

     /**
      * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
      */
     public interface RPattern {
         /**
          * Tests whether the given input satisfies this pattern.
          *
          * @param input the text to test
          * @return true if the input matches, false otherwise
          */
         boolean isMatch(CharSequence input);
     }

     /**
      * An {@link RPattern} that accepts every input, including the empty one.
      */
     public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
         @Override
         public boolean isMatch(final CharSequence input) {
             // unconditional match: the input is never inspected
             return true;
         }
     };

     // NOTE(review): not referenced elsewhere in this class; presumably the "ALL" logical flag
     // mentioned in the class Javadoc - confirm against callers.
     public static final String ALL = "ALL";

     // Quote character removed from both ends of each column by stripQuotes.
     private static final String DOUBLE_QUOTE = "\"";

     // Prefix that marks an include statement in a rules resource.
     private static final String HASH_INCLUDE = "#include";

     // Number of characters to skip to reach the name of the included resource.
     private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();


     // All parsed rules, filled once by the static initializer:
     // name type -> rule type -> language -> first character of the rule pattern -> rules.
     private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
             new EnumMap<>(NameType.class);

     // Loads every rules resource up front: one pass per name type, rule type and language,
     // plus the "common" resource for every rule type other than RULES.
     static {
         for (final NameType nameType : NameType.values()) {
             final Map<RuleType, Map<String, Map<String, List<Rule>>>> byRuleType =
                     new EnumMap<>(RuleType.class);

             for (final RuleType ruleType : RuleType.values()) {
                 final Map<String, Map<String, List<Rule>>> byLanguage = new HashMap<>();

                 final Languages languages = Languages.getInstance(nameType);
                 for (final String language : languages.getLanguages()) {
                     try (final Scanner scanner = createScanner(nameType, ruleType, language)) {
                         final String resourceName = createResourceName(nameType, ruleType, language);
                         byLanguage.put(language, parseRules(scanner, resourceName));
                     } catch (final IllegalStateException e) {
                         // attach the resource name so a broken rules file is easy to locate
                         throw new IllegalStateException(
                                 "Problem processing " + createResourceName(nameType, ruleType, language), e);
                     }
                 }

                 if (ruleType != RuleType.RULES) {
                     try (final Scanner scanner = createScanner(nameType, ruleType, "common")) {
                         final String resourceName = createResourceName(nameType, ruleType, "common");
                         byLanguage.put("common", parseRules(scanner, resourceName));
                     }
                 }

                 byRuleType.put(ruleType, Collections.unmodifiableMap(byLanguage));
             }

             RULES.put(nameType, Collections.unmodifiableMap(byRuleType));
         }
     }

     /**
      * Tests whether a character occurs anywhere in a character sequence.
      *
      * @param chars the characters to search
      * @param input the character to look for
      * @return true if {@code input} occurs in {@code chars}, false otherwise
      */
     private static boolean contains(final CharSequence chars, final char input) {
         int pos = 0;
         while (pos < chars.length()) {
             if (chars.charAt(pos) == input) {
                 return true;
             }
             pos++;
         }
         return false;
     }

     /**
      * Builds the classpath name of the rules resource for a name type, rule type and language.
      *
      * @param nameType the name type whose {@code getName()} forms the first part of the file name
      * @param rt the rule type whose {@code getName()} forms the second part
      * @param lang the language forming the third part
      * @return the resource name, ending in {@code .txt}
      */
     private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
         final Object[] parts = {nameType.getName(), rt.getName(), lang};
         return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", parts);
     }

     /**
      * Opens a scanner over the rules resource for a name type, rule type and language.
      * The caller is responsible for closing the returned scanner.
      *
      * @param nameType the name type
      * @param rt the rule type
      * @param lang the language
      * @return a scanner reading the resource with {@code ResourceConstants.ENCODING}
      */
     private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
         return new Scanner(
                 Resources.getInputStream(createResourceName(nameType, rt, lang)),
                 ResourceConstants.ENCODING);
     }

     /**
      * Opens a scanner over a resource addressed by its bare name, as used by include statements.
      * The caller is responsible for closing the returned scanner.
      *
      * @param lang the resource name without directory or {@code .txt} extension
      * @return a scanner reading the resource with {@code ResourceConstants.ENCODING}
      */
     private static Scanner createScanner(final String lang) {
         return new Scanner(
                 Resources.getInputStream(String.format("org/apache/commons/codec/language/bm/%s.txt", lang)),
                 ResourceConstants.ENCODING);
     }

     /**
      * Tests whether a character sequence ends with the given suffix.
      *
      * @param input the text to inspect
      * @param suffix the required ending; an empty suffix always matches
      * @return true if the last characters of {@code input} equal {@code suffix}
      */
     private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
         // index in input where the suffix would have to begin
         final int offset = input.length() - suffix.length();
         if (offset < 0) {
             return false;
         }
         for (int k = suffix.length() - 1; k >= 0; k--) {
             if (suffix.charAt(k) != input.charAt(offset + k)) {
                 return false;
             }
         }
         return true;
     }

     /**
      * Gets rules for a combination of name type, rule type and languages, flattened into one list.
      * <p>
      * The rules come from {@link #getInstanceMap(NameType, RuleType, Languages.LanguageSet)}; the groups
      * of that map are concatenated in the map's own iteration order.
      *
      * @param nameType
      *            the NameType to consider
      * @param rt
      *            the RuleType to consider
      * @param langs
      *            the set of languages to consider
      * @return a new list of Rules that apply
      */
     public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
                                          final Languages.LanguageSet langs) {
         final List<Rule> flattened = new ArrayList<>();
         for (final Map.Entry<String, List<Rule>> group : getInstanceMap(nameType, rt, langs).entrySet()) {
             flattened.addAll(group.getValue());
         }
         return flattened;
     }

     /**
      * Gets rules for a combination of name type, rule type and a single language.
      * <p>
      * The language is wrapped in a one-element language set and the call is passed on to
      * {@link #getInstance(NameType, RuleType, Languages.LanguageSet)}.
      *
      * @param nameType
      *            the NameType to consider
      * @param rt
      *            the RuleType to consider
      * @param lang
      *            the language to consider
      * @return a list of Rules that apply
      */
     public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
         final Set<String> single = new HashSet<>();
         single.add(lang);
         return getInstance(nameType, rt, LanguageSet.from(single));
     }

     /**
      * Gets rules for a combination of name type, rule type and languages.
      * <p>
      * A set holding exactly one language is looked up under that language; every other set is looked
      * up under {@code Languages.ANY}.
      *
      * @param nameType
      *            the NameType to consider
      * @param rt
      *            the RuleType to consider
      * @param langs
      *            the set of languages to consider
      * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
      * @since 1.9
      */
     public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
                                                          final Languages.LanguageSet langs) {
         if (langs.isSingleton()) {
             return getInstanceMap(nameType, rt, langs.getAny());
         }
         return getInstanceMap(nameType, rt, Languages.ANY);
     }

     /**
      * Gets rules for a combination of name type, rule type and a single language.
      *
      * @param nameType
      *            the NameType to consider
      * @param rt
      *            the RuleType to consider
      * @param lang
      *            the language to consider
      * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
      * @throws IllegalArgumentException
      *             if no rules are registered for the given language
      * @since 1.9
      */
     public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
                                                          final String lang) {
         final Map<String, Map<String, List<Rule>>> byLanguage = RULES.get(nameType).get(rt);
         final Map<String, List<Rule>> rules = byLanguage.get(lang);

         if (rules != null) {
             return rules;
         }

         throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
                                            nameType.getName(), rt.getName(), lang));
     }

     /**
      * Parses one phoneme of the form {@code text} or {@code text[lang1+lang2+...]}.
      * <p>
      * Without a bracketed part the phoneme is given {@code Languages.ANY_LANGUAGE}.
      *
      * @param ph the phoneme text, optionally followed by a bracketed, '+'-separated language list
      * @return the parsed phoneme
      * @throws IllegalArgumentException if {@code ph} contains '[' but does not end in ']'
      */
     private static Phoneme parsePhoneme(final String ph) {
         final int open = ph.indexOf("[");
         if (open < 0) {
             // no language restriction given
             return new Phoneme(ph, Languages.ANY_LANGUAGE);
         }
         if (!ph.endsWith("]")) {
             throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
         }

         final String text = ph.substring(0, open);
         final String[] names = ph.substring(open + 1, ph.length() - 1).split("[+]");
         final Set<String> langs = new HashSet<>(Arrays.asList(names));
         return new Phoneme(text, Languages.LanguageSet.from(langs));
     }

     /**
      * Parses a phoneme expression: either a single phoneme or a parenthesised, '|'-separated list
      * of alternatives.
      * <p>
      * When the list body starts or ends with '|', an empty phoneme for any language is added as a
      * further alternative.
      *
      * @param ph the expression text
      * @return a {@link PhonemeList} for a parenthesised expression, otherwise a single {@link Phoneme}
      * @throws IllegalArgumentException if {@code ph} starts with '(' but does not end with ')'
      */
     private static PhonemeExpr parsePhonemeExpr(final String ph) {
         if (!ph.startsWith("(")) {
             return parsePhoneme(ph);
         }

         // we have a bracketed list of options
         if (!ph.endsWith(")")) {
             throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
         }

         final String body = ph.substring(1, ph.length() - 1);
         final List<Phoneme> alternatives = new ArrayList<>();
         for (final String option : body.split("[|]")) {
             alternatives.add(parsePhoneme(option));
         }

         final boolean hasEmptyOption = body.startsWith("|") || body.endsWith("|");
         if (hasEmptyOption) {
             alternatives.add(new Phoneme("", Languages.ANY_LANGUAGE));
         }

         return new PhonemeList(alternatives);
     }

     /**
      * Parses a rules resource into rules grouped by the first character of their pattern.
      * <p>
      * Each line is handled as follows:
      * <ul>
      * <li>inside a multi-line comment, everything is skipped up to and including a line that ends the comment</li>
      * <li>a line starting a multi-line comment switches comment mode on</li>
      * <li>end-of-line comments are cut off, the remainder is trimmed, and empty lines are skipped</li>
      * <li>an include statement parses the named resource recursively and merges its rules into the result</li>
      * <li>anything else must be a rule of exactly 4 whitespace-separated columns</li>
      * </ul>
      * Included rules are appended to rules already collected under the same key, so an include
      * never discards rules that precede it.
      *
      * @param scanner
      *            the source of lines; not closed by this method
      * @param location
      *            a description of the resource, used in error messages and in each rule's {@code toString()}
      * @return a new map from the first character of the pattern to the rules with that first character
      * @throws IllegalArgumentException
      *             if an include statement is malformed or a rule does not have 4 columns
      * @throws IllegalStateException
      *             if the columns of a rule cannot be parsed, including an empty pattern
      */
     private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
         final Map<String, List<Rule>> lines = new HashMap<>();
         int currentLine = 0;

         boolean inMultilineComment = false;
         while (scanner.hasNextLine()) {
             currentLine++;
             final String rawLine = scanner.nextLine();
             String line = rawLine;

             if (inMultilineComment) {
                 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
                     inMultilineComment = false;
                 }
             } else {
                 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
                     inMultilineComment = true;
                 } else {
                     // discard comments
                     final int cmtI = line.indexOf(ResourceConstants.CMT);
                     if (cmtI >= 0) {
                         line = line.substring(0, cmtI);
                     }

                     // trim leading-trailing whitespace
                     line = line.trim();

                     if (line.isEmpty()) {
                         continue; // empty lines can be safely skipped
                     }

                     if (line.startsWith(HASH_INCLUDE)) {
                         // include statement
                         final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
                         if (incl.contains(" ")) {
                             throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
                                                                location);
                         }
                         try (final Scanner hashIncludeScanner = createScanner(incl)) {
                             final Map<String, List<Rule>> included =
                                     parseRules(hashIncludeScanner, location + "->" + incl);
                             // merge per key: a plain putAll would replace, and so lose, any rules
                             // already collected under the same first character
                             for (final Map.Entry<String, List<Rule>> entry : included.entrySet()) {
                                 final List<Rule> existing = lines.get(entry.getKey());
                                 if (existing == null) {
                                     lines.put(entry.getKey(), entry.getValue());
                                 } else {
                                     existing.addAll(entry.getValue());
                                 }
                             }
                         }
                     } else {
                         // rule
                         final String[] parts = line.split("\\s+");
                         if (parts.length != 4) {
                             throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
                                                                " parts: " + rawLine + " in " + location);
                         }
                         try {
                             final String pat = stripQuotes(parts[0]);
                             if (pat.isEmpty()) {
                                 // rules are keyed by their first character, so a pattern is required;
                                 // reported like every other parse problem instead of failing in substring
                                 throw new IllegalArgumentException("Rule pattern must not be empty");
                             }
                             final String lCon = stripQuotes(parts[1]);
                             final String rCon = stripQuotes(parts[2]);
                             final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
                             final int cLine = currentLine;
                             // anonymous subclass that remembers where the rule was defined, for diagnostics
                             final Rule r = new Rule(pat, lCon, rCon, ph) {
                                 private final int myLine = cLine;
                                 private final String loc = location;

                                 @Override
                                 public String toString() {
                                     final StringBuilder sb = new StringBuilder();
                                     sb.append("Rule");
                                     sb.append("{line=").append(myLine);
                                     sb.append(", loc='").append(loc).append('\'');
                                     sb.append(", pat='").append(pat).append('\'');
                                     sb.append(", lcon='").append(lCon).append('\'');
                                     sb.append(", rcon='").append(rCon).append('\'');
                                     sb.append('}');
                                     return sb.toString();
                                 }
                             };
                             final String patternKey = r.pattern.substring(0, 1);
                             List<Rule> rules = lines.get(patternKey);
                             if (rules == null) {
                                 rules = new ArrayList<>();
                                 lines.put(patternKey, rules);
                             }
                             rules.add(r);
                         } catch (final IllegalArgumentException e) {
                             throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
                                                             location, e);
                         }
                     }
                 }
             }
         }

         return lines;
     }

     /**
      * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
      * <p>
      * The fast paths apply when the regex is anchored with a leading {@code ^} and/or a trailing {@code $} and
      * the remaining content is either free of {@code [} or is exactly one character class. Only {@code [} is
      * checked for, so any other regex metacharacter in such content is compared literally on these paths.
      * Everything else is compiled with {@link Pattern} and tested with {@link Matcher#find()}.
      *
      * @param regex
      *            the regular expression to compile
      * @return an RPattern that will match this regex
      */
     private static RPattern pattern(final String regex) {
         final boolean startsWith = regex.startsWith("^");
         final boolean endsWith = regex.endsWith("$");
         final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
         final boolean boxes = content.contains("[");

         if (!boxes) {
             if (startsWith && endsWith) {
                 // exact match
                 if (content.isEmpty()) {
                     // empty
                     return new RPattern() {
                         @Override
                         public boolean isMatch(final CharSequence input) {
                             return input.length() == 0;
                         }
                     };
                 }
                 return new RPattern() {
                     @Override
                     public boolean isMatch(final CharSequence input) {
                         // compare characters: CharSequence.equals is not content-based, so
                         // input.equals(content) only worked when input happened to be a String
                         return content.contentEquals(input);
                     }
                 };
             }
             if ((startsWith || endsWith) && content.isEmpty()) {
                 // matches every string
                 return ALL_STRINGS_RMATCHER;
             }
             if (startsWith) {
                 // matches from start
                 return new RPattern() {
                     @Override
                     public boolean isMatch(final CharSequence input) {
                         return startsWith(input, content);
                     }
                 };
             }
             if (endsWith) {
                 // matches from end
                 return new RPattern() {
                     @Override
                     public boolean isMatch(final CharSequence input) {
                         return endsWith(input, content);
                     }
                 };
             }
         } else {
             final boolean startsWithBox = content.startsWith("[");
             final boolean endsWithBox = content.endsWith("]");

             if (startsWithBox && endsWithBox) {
                 String boxContent = content.substring(1, content.length() - 1);
                 if (!boxContent.contains("[")) {
                     // box containing alternatives
                     final boolean negate = boxContent.startsWith("^");
                     if (negate) {
                         boxContent = boxContent.substring(1);
                     }
                     final String bContent = boxContent;
                     final boolean shouldMatch = !negate;

                     if (startsWith && endsWith) {
                         // exact match
                         return new RPattern() {
                             @Override
                             public boolean isMatch(final CharSequence input) {
                                 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
                             }
                         };
                     }
                     if (startsWith) {
                         // first char
                         return new RPattern() {
                             @Override
                             public boolean isMatch(final CharSequence input) {
                                 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
                             }
                         };
                     }
                     if (endsWith) {
                         // last char
                         return new RPattern() {
                             @Override
                             public boolean isMatch(final CharSequence input) {
                                 return input.length() > 0 &&
                                        contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
                             }
                         };
                     }
                 }
             }
         }

         // no fast path applies: fall back to the regex engine
         return new RPattern() {
             final Pattern pattern = Pattern.compile(regex);

             @Override
             public boolean isMatch(final CharSequence input) {
                 final Matcher matcher = pattern.matcher(input);
                 return matcher.find();
             }
         };
     }

     /**
      * Tests whether a character sequence begins with the given prefix.
      *
      * @param input the text to inspect
      * @param prefix the required beginning; an empty prefix always matches
      * @return true if the first characters of {@code input} equal {@code prefix}
      */
     private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
         if (input.length() < prefix.length()) {
             return false;
         }
         int pos = 0;
         while (pos < prefix.length()) {
             if (prefix.charAt(pos) != input.charAt(pos)) {
                 return false;
             }
             pos++;
         }
         return true;
     }

     /**
      * Removes one leading and one trailing double quote, each only if present.
      * <p>
      * The trailing quote is looked for after the leading one has been removed, so a string
      * consisting of a single quote becomes empty.
      *
      * @param str the text to unquote
      * @return the text without its surrounding quotes
      */
     private static String stripQuotes(String str) {
         final String withoutLeading = str.startsWith(DOUBLE_QUOTE) ? str.substring(1) : str;
         if (!withoutLeading.endsWith(DOUBLE_QUOTE)) {
             return withoutLeading;
         }
         return withoutLeading.substring(0, withoutLeading.length() - 1);
     }

     // Matcher for the text left of the pattern; built from the left context with "$" appended.
     private final RPattern lContext;

     // The literal text this rule matches.
     private final String pattern;

     // The phoneme expression associated with a match of this rule.
     private final PhonemeExpr phoneme;

     // Matcher for the text right of the pattern; built from the right context with "^" prepended.
     private final RPattern rContext;

     /**
      * Creates a new rule.
      *
      * @param pattern
      *            the pattern
      * @param lContext
      *            the left context
      * @param rContext
      *            the right context
      * @param phoneme
      *            the resulting phoneme
      */
     public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
         this.pattern = pattern;
         this.lContext = pattern(lContext + "$");
         this.rContext = pattern("^" + rContext);
         this.phoneme = phoneme;
     }

     /**
      * Gets the left context. This is a regular expression that must match to the left of the pattern.
      *
      * @return the left context as an {@link RPattern}
      */
     public RPattern getLContext() {
         return this.lContext;
     }

     /**
      * Gets the pattern. This is a string-literal that must exactly match.
      *
      * @return the pattern
      */
     public String getPattern() {
         return this.pattern;
     }

     /**
      * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
      *
      * @return the phoneme
      */
     public PhonemeExpr getPhoneme() {
         return this.phoneme;
     }

     /**
      * Gets the right context. This is a regular expression that must match to the right of the pattern.
      *
      * @return the right context as an {@link RPattern}
      */
     public RPattern getRContext() {
         return this.rContext;
     }

     /**
      * Decides if the pattern and context match the input starting at a position. It is a match if the
      * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and
      * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}.
      * <p>
      * The pattern is compared character by character, so any {@link CharSequence} implementation may
      * be passed as input.
      *
      * @param input
      *            the input String
      * @param i
      *            the int position within the input
      * @return true if the pattern and left/right context match, false otherwise
      * @throws IndexOutOfBoundsException
      *             if {@code i} is negative
      */
     public boolean patternAndContextMatches(final CharSequence input, final int i) {
         if (i < 0) {
             throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
         }

         final int patternLength = this.pattern.length();
         final int inputLength = input.length();

         // written as a subtraction so that a very large i cannot overflow the comparison
         if (patternLength > inputLength - i) {
             // not enough room for the pattern to match
             return false;
         }
         final int ipl = i + patternLength;

         // evaluate the pattern, left context and right context
         // fail early if any of the evaluations is not successful
         // contentEquals compares characters; CharSequence.equals is not guaranteed to
         if (!this.pattern.contentEquals(input.subSequence(i, ipl))) {
             return false;
         }
         if (!this.rContext.isMatch(input.subSequence(ipl, inputLength))) {
             return false;
         }
         return this.lContext.isMatch(input.subSequence(0, i));
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.commons.codec.language.bm;

	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.EnumMap;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Map;
	import java.util.Scanner;
	import java.util.Set;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import org.apache.commons.codec.Resources;
	import org.apache.commons.codec.language.bm.Languages.LanguageSet;

	/**
	* A phoneme rule.
	* <p>
	* Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
	* and a logical flag indicating if all languages must be in play. A rule matches if:
	* <ul>
	* <li>the pattern matches at the current position</li>
	* <li>the string up until the beginning of the pattern matches the left context</li>
	* <li>the string from the end of the pattern matches the right context</li>
	* <li>logical is ALL and all languages are in scope; or</li>
	* <li>logical is any other value and at least one language is in scope</li>
	* </ul>
	* <p>
	* Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
	* to explicitly construct their own.
	* <p>
	* Rules are immutable and thread-safe.
	* <p>
	* <b>Rules resources</b>
	* <p>
	* Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
	* named following the pattern:
	* <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
	* <p>
	* The format of these resources is the following:
	* <ul>
	* <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
	* will be interpreted as:
	* <ol>
	* <li>pattern</li>
	* <li>left context</li>
	* <li>right context</li>
	* <li>phoneme</li>
	* </ol>
	* </li>
	* <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
	* as a comment.</li>
	* <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
	* all content until a line ending in '*' and '/' is found.</li>
	* <li><b>Blank lines:</b> All blank lines will be skipped.</li>
	* </ul>
	*
	* @since 1.6
	*/
	public class Rule {

	/**
	 * A single phoneme: a run of phonetic text paired with the set of languages it applies to.
	 * <p>
	 * The text lives in a {@link StringBuilder} and grows in place through {@link #append(CharSequence)}.
	 */
	public static final class Phoneme implements PhonemeExpr {
	    /**
	     * Orders phonemes by their text alone, one character at a time. When one text is a prefix of
	     * the other, the shorter text sorts first. The language set takes no part in the ordering.
	     */
	    public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
	        @Override
	        public int compare(final Phoneme o1, final Phoneme o2) {
	            final CharSequence left = o1.phonemeText;
	            final CharSequence right = o2.phonemeText;
	            final int leftLength = left.length();
	            final int rightLength = right.length();
	            final int shared = Math.min(leftLength, rightLength);

	            for (int pos = 0; pos < shared; pos++) {
	                final int delta = left.charAt(pos) - right.charAt(pos);
	                if (delta != 0) {
	                    return delta;
	                }
	            }

	            // identical across the shared prefix: the longer text is the greater one
	            if (leftLength == rightLength) {
	                return 0;
	            }
	            return leftLength < rightLength ? -1 : +1;
	        }
	    };

	    private final StringBuilder phonemeText;
	    private final Languages.LanguageSet languages;

	    /**
	     * Creates a phoneme from a copy of the given text and the given language set.
	     *
	     * @param phonemeText the initial text; its characters are copied
	     * @param languages the languages this phoneme applies to
	     */
	    public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
	        this.phonemeText = new StringBuilder(phonemeText);
	        this.languages = languages;
	    }

	    /**
	     * Creates a phoneme whose text is the left text followed by the right text, keeping the
	     * language set of the left phoneme.
	     *
	     * @param phonemeLeft the phoneme supplying the leading text and the languages
	     * @param phonemeRight the phoneme supplying the trailing text
	     */
	    public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
	        this(phonemeLeft, phonemeRight, phonemeLeft.languages);
	    }

	    /**
	     * Creates a phoneme whose text is the left text followed by the right text, with an
	     * explicitly supplied language set.
	     *
	     * @param phonemeLeft the phoneme supplying the leading text
	     * @param phonemeRight the phoneme supplying the trailing text
	     * @param languages the languages of the new phoneme
	     */
	    public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
	        this(phonemeLeft.phonemeText, languages);
	        this.phonemeText.append(phonemeRight.phonemeText);
	    }

	    /**
	     * Appends text to this phoneme in place.
	     *
	     * @param str the text to append
	     * @return this phoneme
	     */
	    public Phoneme append(final CharSequence str) {
	        phonemeText.append(str);
	        return this;
	    }

	    /**
	     * Gets the languages this phoneme applies to.
	     *
	     * @return the language set given at construction
	     */
	    public Languages.LanguageSet getLanguages() {
	        return languages;
	    }

	    /**
	     * Gets this phoneme as a one-element collection.
	     *
	     * @return a singleton containing this phoneme
	     */
	    @Override
	    public Iterable<Phoneme> getPhonemes() {
	        return Collections.singleton(this);
	    }

	    /**
	     * Gets the phoneme text.
	     *
	     * @return the underlying text buffer itself, not a copy
	     */
	    public CharSequence getPhonemeText() {
	        return phonemeText;
	    }

	    /**
	     * Deprecated since 1.9.
	     *
	     * @param right the Phoneme to join
	     * @return a new Phoneme
	     * @deprecated since 1.9
	     */
	    @Deprecated
	    public Phoneme join(final Phoneme right) {
	        final String joinedText = this.phonemeText.toString() + right.phonemeText.toString();
	        return new Phoneme(joinedText, this.languages.restrictTo(right.languages));
	    }

	    /**
	     * Returns a new Phoneme with the same text but a union of its
	     * current language set and the given one.
	     *
	     * @param lang the language set to merge
	     * @return a new Phoneme
	     */
	    public Phoneme mergeWithLanguage(final LanguageSet lang) {
	        final String sameText = this.phonemeText.toString();
	        return new Phoneme(sameText, this.languages.merge(lang));
	    }

	    /**
	     * Renders the phoneme as its text followed by the language set in square brackets.
	     */
	    @Override
	    public String toString() {
	        final StringBuilder rendered = new StringBuilder(phonemeText.toString());
	        rendered.append("[").append(String.valueOf(languages)).append("]");
	        return rendered.toString();
	    }
	}

	/**
	* Something that can be expanded into one or more {@link Phoneme}s. Within this class it is implemented by
	* {@link Phoneme} (which yields itself) and {@link PhonemeList} (which yields its list).
	*/
	public interface PhonemeExpr {
	/**
	* Gets the phonemes this expression stands for.
	*
	* @return the phonemes
	*/
	Iterable<Phoneme> getPhonemes();
	}

	/**
	 * A {@link PhonemeExpr} backed by a list of phonemes.
	 * <p>
	 * The list given to the constructor is stored and returned as-is; no copy is made.
	 */
	public static final class PhonemeList implements PhonemeExpr {
		private final List<Phoneme> phonemes;

		/**
		 * Wraps the given list.
		 *
		 * @param phonemes the phonemes to expose; kept by reference
		 */
		public PhonemeList(final List<Phoneme> phonemes) {
			this.phonemes = phonemes;
		}

		/**
		 * Gets the wrapped list.
		 *
		 * @return the same list instance that was passed to the constructor
		 */
		@Override
		public List<Phoneme> getPhonemes() {
			return phonemes;
		}
	}

	/**
	* A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
	*/
	public interface RPattern {
	/**
	* Tests the given input against this pattern.
	*
	* @param input the text to test
	* @return true if the input matches, false otherwise
	*/
	boolean isMatch(CharSequence input);
	}

	/**
	* An {@link RPattern} that reports a match for every input. Returned by {@code pattern(String)} when the
	* expression is empty apart from a single anchor.
	*/
	public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
	@Override
	public boolean isMatch(final CharSequence input) {
	return true;
	}
	};

	/**
	* The literal {@code "ALL"}.
	* NOTE(review): presumably the "logical" flag value mentioned in the class documentation; it is not
	* referenced in this part of the file - confirm against callers.
	*/
	public static final String ALL = "ALL";

	/** Quote character removed from both ends of each column of a rule line. */
	private static final String DOUBLE_QUOTE = "\"";

	/** Prefix that marks a line of a rules resource as an include statement. */
	private static final String HASH_INCLUDE = "#include";

	/** Length of {@link #HASH_INCLUDE}, used to cut the included resource name out of the line. */
	private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();


	/**
	 * All rules, loaded when the class is initialised. Indexed by name type, then rule type, then language
	 * name, then the first character of each rule's pattern.
	 */
	private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
			new EnumMap<>(NameType.class);

	static {
		for (final NameType nameType : NameType.values()) {
			final Map<RuleType, Map<String, Map<String, List<Rule>>>> byRuleType =
					new EnumMap<>(RuleType.class);

			for (final RuleType ruleType : RuleType.values()) {
				final Map<String, Map<String, List<Rule>>> byLanguage = new HashMap<>();

				final Languages known = Languages.getInstance(nameType);
				for (final String language : known.getLanguages()) {
					final String resource = createResourceName(nameType, ruleType, language);
					try (final Scanner scanner = createScanner(nameType, ruleType, language)) {
						byLanguage.put(language, parseRules(scanner, resource));
					} catch (final IllegalStateException e) {
						// add the resource name so a parse failure can be traced to its file
						throw new IllegalStateException("Problem processing " + resource, e);
					}
				}

				// every rule type other than RULES additionally loads a "common" resource
				if (ruleType != RuleType.RULES) {
					final String commonResource = createResourceName(nameType, ruleType, "common");
					try (final Scanner scanner = createScanner(nameType, ruleType, "common")) {
						byLanguage.put("common", parseRules(scanner, commonResource));
					}
				}

				byRuleType.put(ruleType, Collections.unmodifiableMap(byLanguage));
			}

			RULES.put(nameType, Collections.unmodifiableMap(byRuleType));
		}
	}

	/**
	 * Tells whether a character occurs anywhere in a character sequence.
	 *
	 * @param chars the characters to search
	 * @param input the character to look for
	 * @return true if {@code input} is one of the characters of {@code chars}
	 */
	private static boolean contains(final CharSequence chars, final char input) {
		final int length = chars.length();
		int pos = 0;
		while (pos < length && chars.charAt(pos) != input) {
			pos++;
		}
		// stopping before the end means the character was found
		return pos < length;
	}

	/**
	 * Builds the classpath name of the rules resource for a name type, rule type and language.
	 *
	 * @param nameType the name type whose {@code getName()} forms the first part of the file name
	 * @param rt       the rule type whose {@code getName()} forms the second part
	 * @param lang     the language name forming the third part
	 * @return the resource name, ending in {@code .txt}
	 */
	private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
		final String namePart = nameType.getName();
		final String rulePart = rt.getName();
		return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", namePart, rulePart, lang);
	}

	/**
	 * Opens a scanner over the rules resource for a name type, rule type and language.
	 *
	 * @param nameType the name type of the resource
	 * @param rt       the rule type of the resource
	 * @param lang     the language of the resource
	 * @return a scanner reading the resource with {@code ResourceConstants.ENCODING}; the caller closes it
	 */
	private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
		return new Scanner(
				Resources.getInputStream(createResourceName(nameType, rt, lang)),
				ResourceConstants.ENCODING);
	}

	/**
	 * Opens a scanner over a resource addressed by its bare name, as used by include statements.
	 *
	 * @param lang the resource name without directory and without the {@code .txt} extension
	 * @return a scanner reading the resource with {@code ResourceConstants.ENCODING}; the caller closes it
	 */
	private static Scanner createScanner(final String lang) {
		return new Scanner(
				Resources.getInputStream(String.format("org/apache/commons/codec/language/bm/%s.txt", lang)),
				ResourceConstants.ENCODING);
	}

	/**
	 * Tells whether a character sequence ends with the given suffix.
	 *
	 * @param input  the text to inspect
	 * @param suffix the expected ending; an empty suffix always matches
	 * @return true if the last characters of {@code input} equal {@code suffix}
	 */
	private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
		final int suffixLength = suffix.length();
		// index in input where the suffix would have to begin
		final int offset = input.length() - suffixLength;
		if (offset < 0) {
			return false;
		}

		int k = 0;
		while (k < suffixLength) {
			if (input.charAt(offset + k) != suffix.charAt(k)) {
				return false;
			}
			k++;
		}
		return true;
	}

	/**
	 * Gets the rules for a name type, rule type and language set as one flat list.
	 * <p>
	 * The rules are looked up with {@link #getInstanceMap(NameType, RuleType, Languages.LanguageSet)} and the
	 * per-key lists are concatenated in the iteration order of that map.
	 *
	 * @param nameType
	 *            the NameType to consider
	 * @param rt
	 *            the RuleType to consider
	 * @param langs
	 *            the set of languages to consider
	 * @return a new list holding all Rules that apply
	 */
	public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
			final Languages.LanguageSet langs) {
		final Map<String, List<Rule>> grouped = getInstanceMap(nameType, rt, langs);
		final List<Rule> flattened = new ArrayList<>();
		for (final Map.Entry<String, List<Rule>> group : grouped.entrySet()) {
			flattened.addAll(group.getValue());
		}
		return flattened;
	}

	/**
	 * Gets the rules for a name type, rule type and one language as a flat list.
	 * <p>
	 * The language is wrapped in a single-element language set and passed to
	 * {@link #getInstance(NameType, RuleType, Languages.LanguageSet)}.
	 *
	 * @param nameType
	 *            the NameType to consider
	 * @param rt
	 *            the RuleType to consider
	 * @param lang
	 *            the language to consider
	 * @return a list of Rules that apply
	 */
	public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
		final Set<String> single = new HashSet<>();
		single.add(lang);
		final LanguageSet languageSet = LanguageSet.from(single);
		return getInstance(nameType, rt, languageSet);
	}

	/**
	 * Gets the rules for a name type, rule type and language set, grouped by pattern start.
	 * <p>
	 * When the set holds exactly one language, that language's rules are returned; in every other case the
	 * rules registered under {@code Languages.ANY} are returned.
	 *
	 * @param nameType
	 *            the NameType to consider
	 * @param rt
	 *            the RuleType to consider
	 * @param langs
	 *            the set of languages to consider
	 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
	 * @since 1.9
	 */
	public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
			final Languages.LanguageSet langs) {
		if (langs.isSingleton()) {
			return getInstanceMap(nameType, rt, langs.getAny());
		}
		return getInstanceMap(nameType, rt, Languages.ANY);
	}

	/**
	 * Gets the rules for a name type, rule type and one language, grouped by pattern start.
	 *
	 * @param nameType
	 *            the NameType to consider
	 * @param rt
	 *            the RuleType to consider
	 * @param lang
	 *            the language to consider
	 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
	 * @throws IllegalArgumentException
	 *             if no rules are registered for the given language
	 * @since 1.9
	 */
	public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
			final String lang) {
		final Map<RuleType, Map<String, Map<String, List<Rule>>>> byRuleType = RULES.get(nameType);
		final Map<String, Map<String, List<Rule>>> byLanguage = byRuleType.get(rt);
		final Map<String, List<Rule>> rules = byLanguage.get(lang);

		if (rules != null) {
			return rules;
		}

		throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
				nameType.getName(), rt.getName(), lang));
	}

	/**
	 * Parses a single phoneme of the form {@code text} or {@code text[lang1+lang2]}.
	 * <p>
	 * Without a {@code '['} the whole string is the phoneme text and it is tied to
	 * {@code Languages.ANY_LANGUAGE}. With one, the text is everything before the first {@code '['} and the
	 * part between it and the final {@code ']'} is a {@code '+'}-separated list of language names.
	 *
	 * @param ph the phoneme expression
	 * @return the parsed phoneme
	 * @throws IllegalArgumentException if the expression contains {@code '['} but does not end in {@code ']'}
	 */
	private static Phoneme parsePhoneme(final String ph) {
		final int open = ph.indexOf('[');
		if (open < 0) {
			// no language restriction given
			return new Phoneme(ph, Languages.ANY_LANGUAGE);
		}

		if (!ph.endsWith("]")) {
			throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
		}

		final String text = ph.substring(0, open);
		final String languageList = ph.substring(open + 1, ph.length() - 1);
		final String[] names = languageList.split("[+]");
		final Set<String> langs = new HashSet<>(Arrays.asList(names));
		return new Phoneme(text, Languages.LanguageSet.from(langs));
	}

	private static PhonemeExpr parsePhonemeExpr(final String ph) {
	if (ph.startsWith("(")) { // we have a bracketed list of options
	if (!ph.endsWith(")")) {
	throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
	}

	final List<Phoneme> phs = new ArrayList<>();
	final String body = ph.substring(1, ph.length() - 1);
	for (final String part : body.split("[\|]")) {
	phs.add(parsePhoneme(part));
	}
	if (body.startsWith("\|") \|\| body.endsWith("\|")) {
	phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
	}

	return new PhonemeList(phs);
	}
	return parsePhoneme(ph);
	}

	/**
	 * Parses rule definitions from a scanner into lists grouped by the first character of each rule's pattern.
	 * <p>
	 * Lines are handled as follows:
	 * <ul>
	 * <li>a line starting with {@code ResourceConstants.EXT_CMT_START} switches to multi-line comment mode, which
	 * lasts until a later line ends with {@code ResourceConstants.EXT_CMT_END};</li>
	 * <li>everything from {@code ResourceConstants.CMT} to the end of a line is discarded;</li>
	 * <li>blank lines are skipped;</li>
	 * <li>a line starting with {@code #include} names another resource whose rules are parsed recursively and
	 * appended to the rules collected so far;</li>
	 * <li>any other line must consist of four whitespace-separated, optionally double-quoted columns: pattern,
	 * left context, right context and phoneme expression.</li>
	 * </ul>
	 * NOTE(review): the multi-line comment end marker is only checked on lines after the one that opened the
	 * comment, so a comment opened and closed on the same line still hides the following lines.
	 *
	 * @param scanner
	 *            the source of the rule text, read line by line
	 * @param location
	 *            a description of where the text comes from, used in error messages and in the
	 *            {@code toString()} of the created rules
	 * @return the parsed rules, keyed by the first character of their pattern
	 * @throws IllegalArgumentException
	 *             if an include statement contains a space or a rule line does not split into four parts
	 * @throws IllegalStateException
	 *             if the columns of a rule line cannot be turned into a rule
	 */
	private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
		final Map<String, List<Rule>> lines = new HashMap<>();
		int currentLine = 0;

		boolean inMultilineComment = false;
		while (scanner.hasNextLine()) {
			currentLine++;
			final String rawLine = scanner.nextLine();
			String line = rawLine;

			if (inMultilineComment) {
				if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
					inMultilineComment = false;
				}
				continue;
			}
			if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
				inMultilineComment = true;
				continue;
			}

			// discard comments
			final int cmtI = line.indexOf(ResourceConstants.CMT);
			if (cmtI >= 0) {
				line = line.substring(0, cmtI);
			}

			// trim leading-trailing whitespace
			line = line.trim();

			if (line.isEmpty()) {
				continue; // empty lines can be safely skipped
			}

			if (line.startsWith(HASH_INCLUDE)) {
				// include statement
				final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
				if (incl.contains(" ")) {
					throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
							location);
				}
				try (final Scanner hashIncludeScanner = createScanner(incl)) {
					final Map<String, List<Rule>> included =
							parseRules(hashIncludeScanner, location + "->" + incl);
					// Append per key instead of putAll: putAll would replace a list that already holds
					// rules read earlier from this resource, silently dropping them.
					for (final Map.Entry<String, List<Rule>> entry : included.entrySet()) {
						final List<Rule> existing = lines.get(entry.getKey());
						if (existing == null) {
							lines.put(entry.getKey(), entry.getValue());
						} else {
							existing.addAll(entry.getValue());
						}
					}
				}
				continue;
			}

			// rule
			final String[] parts = line.split("\\s+");
			if (parts.length != 4) {
				throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
						" parts: " + rawLine + " in " + location);
			}
			try {
				final String pat = stripQuotes(parts[0]);
				if (pat.isEmpty()) {
					// rules are indexed by the first pattern character, so an empty pattern cannot be stored;
					// fail here so the error carries the line number and location added below
					throw new IllegalArgumentException("Rule pattern must not be empty");
				}
				final String lCon = stripQuotes(parts[1]);
				final String rCon = stripQuotes(parts[2]);
				final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
				final int cLine = currentLine;
				final Rule r = new Rule(pat, lCon, rCon, ph) {
					private final int myLine = cLine;
					private final String loc = location;

					@Override
					public String toString() {
						final StringBuilder sb = new StringBuilder();
						sb.append("Rule");
						sb.append("{line=").append(myLine);
						sb.append(", loc='").append(loc).append('\'');
						sb.append(", pat='").append(pat).append('\'');
						sb.append(", lcon='").append(lCon).append('\'');
						sb.append(", rcon='").append(rCon).append('\'');
						sb.append('}');
						return sb.toString();
					}
				};
				final String patternKey = r.pattern.substring(0, 1);
				List<Rule> rules = lines.get(patternKey);
				if (rules == null) {
					rules = new ArrayList<>();
					lines.put(patternKey, rules);
				}
				rules.add(r);
			} catch (final IllegalArgumentException e) {
				throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
						location, e);
			}
		}

		return lines;
	}

	/**
	* Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
	*
	* @param regex
	* the regular expression to compile
	* @return an RPattern that will match this regex
	*/
	private static RPattern pattern(final String regex) {
	final boolean startsWith = regex.startsWith("^");
	final boolean endsWith = regex.endsWith("$");
	final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
	final boolean boxes = content.contains("[");

	if (!boxes) {
	if (startsWith && endsWith) {
	// exact match
	if (content.isEmpty()) {
	// empty
	return new RPattern() {
	@Override
	public boolean isMatch(final CharSequence input) {
	return input.length() == 0;
	}
	};
	}
	return new RPattern() {
	@Override
	public boolean isMatch(final CharSequence input) {
	return input.equals(content);
	}
	};
	}
	if ((startsWith \|\| endsWith) && content.isEmpty()) {
	// matches every string
	return ALL_STRINGS_RMATCHER;
	}
	if (startsWith) {
	// matches from start
	return new RPattern() {
	@Override
	public boolean isMatch(final CharSequence input) {
	return startsWith(input, content);
	}
	};
	}
	if (endsWith) {
	// matches from start
	return new RPattern() {
	@Override
	public boolean isMatch(final CharSequence input) {
	return endsWith(input, content);
	}
	};
	}
	} else {
	final boolean startsWithBox = content.startsWith("[");
	final boolean endsWithBox = content.endsWith("]");

	if (startsWithBox && endsWithBox) {
	String boxContent = content.substring(1, content.length() - 1);
	if (!boxContent.contains("[")) {
	// box containing alternatives
	final boolean negate = boxContent.startsWith("^");
	if (negate) {
	boxContent = boxContent.substring(1);
	}
	final String bContent = boxContent;
	final boolean shouldMatch = !negate;

	if (startsWith && endsWith) {
	// exact match
	return new RPattern() {
	@Override
	public boolean isMatch(final CharSequence input) {
	return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
	}
	};
	}
	if (startsWith) {
	// first char
	return new RPattern() {
	@Override
	public boolean isMatch(final CharSequence input) {
	return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
	}
	};
	}
	if (endsWith) {
	// last char
	return new RPattern() {
	@Override
	public boolean isMatch(final CharSequence input) {
	return input.length() > 0 &&
	contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
	}
	};
	}
	}
	}
	}

	return new RPattern() {
	final Pattern pattern = Pattern.compile(regex);

	@Override
	public boolean isMatch(final CharSequence input) {
	final Matcher matcher = pattern.matcher(input);
	return matcher.find();
	}
	};
	}

	/**
	 * Tells whether a character sequence begins with the given prefix.
	 *
	 * @param input  the text to inspect
	 * @param prefix the expected beginning; an empty prefix always matches
	 * @return true if the first characters of {@code input} equal {@code prefix}
	 */
	private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
		final int prefixLength = prefix.length();
		if (prefixLength > input.length()) {
			return false;
		}

		int pos = 0;
		while (pos < prefixLength) {
			if (input.charAt(pos) != prefix.charAt(pos)) {
				return false;
			}
			pos++;
		}
		return true;
	}

	/**
	 * Removes one leading and one trailing double quote from a string, each only if present.
	 * <p>
	 * The trailing quote is looked for after the leading one has been removed, so a string consisting of a
	 * single quote becomes the empty string.
	 *
	 * @param str the text to unquote
	 * @return the text without its surrounding quotes
	 */
	private static String stripQuotes(String str) {
		final String withoutLeading = str.startsWith(DOUBLE_QUOTE) ? str.substring(1) : str;
		return withoutLeading.endsWith(DOUBLE_QUOTE)
				? withoutLeading.substring(0, withoutLeading.length() - 1)
				: withoutLeading;
	}

	/** Compiled left context; tested against the part of the input that precedes the pattern. */
	private final RPattern lContext;

	/** Literal text that must occur at the position being tested. */
	private final String pattern;

	/** Phoneme expression associated with this rule. */
	private final PhonemeExpr phoneme;

	/** Compiled right context; tested against the part of the input that follows the pattern. */
	private final RPattern rContext;

	/**
	* Creates a new rule.
	*
	* @param pattern
	* the pattern
	* @param lContext
	* the left context
	* @param rContext
	* the right context
	* @param phoneme
	* the resulting phoneme
	*/
	public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
	this.pattern = pattern;
	this.lContext = pattern(lContext + "$");
	this.rContext = pattern("^" + rContext);
	this.phoneme = phoneme;
	}

	/**
	 * Gets the left context, the compiled expression that has to match the input to the left of the pattern.
	 *
	 * @return the left context as an {@link RPattern}
	 */
	public RPattern getLContext() {
		return lContext;
	}

	/**
	 * Gets the pattern, a string literal that has to occur exactly at the tested position.
	 *
	 * @return the pattern text
	 */
	public String getPattern() {
		return pattern;
	}

	/**
	 * Gets the phoneme expression associated with this rule's pattern.
	 *
	 * @return the phoneme expression passed at construction
	 */
	public PhonemeExpr getPhoneme() {
		return phoneme;
	}

	/**
	 * Gets the right context, the compiled expression that has to match the input to the right of the pattern.
	 *
	 * @return the right context as an {@link RPattern}
	 */
	public RPattern getRContext() {
		return rContext;
	}

	/**
	 * Decides if the pattern and context match the input starting at a position. It is a match if the
	 * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and
	 * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}.
	 * <p>
	 * The pattern is compared character by character, so any {@link CharSequence} implementation may be
	 * passed as input.
	 *
	 * @param input
	 *            the input String
	 * @param i
	 *            the int position within the input
	 * @return true if the pattern and left/right context match, false otherwise
	 * @throws IndexOutOfBoundsException
	 *             if {@code i} is negative
	 */
	public boolean patternAndContextMatches(final CharSequence input, final int i) {
		if (i < 0) {
			throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
		}

		final int patternLength = this.pattern.length();

		// Compare against the remaining length instead of computing i + patternLength first:
		// with a very large i that sum could overflow and slip past the check.
		if (patternLength > input.length() - i) {
			// not enough room for the pattern to match
			return false;
		}
		final int ipl = i + patternLength;

		// evaluate the pattern, left context and right context
		// fail early if any of the evaluations is not successful
		// contentEquals compares characters, so it also works when subSequence does not return a String
		if (!this.pattern.contentEquals(input.subSequence(i, ipl))) {
			return false;
		}
		if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
			return false;
		}
		return this.lContext.isMatch(input.subSequence(0, i));
	}
	}