blob: f46f5449942cf43b0bf776b235cdc5d53cf35cef [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.codec.language.bm;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.codec.Resources;
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
/**
* A phoneme rule.
* <p>
* Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
* and a logical flag indicating if all languages must be in play. A rule matches if:
* <ul>
* <li>the pattern matches at the current position</li>
* <li>the string up until the beginning of the pattern matches the left context</li>
* <li>the string from the end of the pattern matches the right context</li>
* <li>logical is ALL and all languages are in scope; or</li>
* <li>logical is any other value and at least one language is in scope</li>
* </ul>
* <p>
* Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
* to explicitly construct their own.
* <p>
* Rules are immutable and thread-safe.
* <p>
* <b>Rules resources</b>
* <p>
* Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
* named following the pattern:
* <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
* <p>
* The format of these resources is the following:
* <ul>
* <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
* will be interpreted as:
* <ol>
* <li>pattern</li>
* <li>left context</li>
* <li>right context</li>
* <li>phoneme</li>
* </ol>
* </li>
* <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
* as a comment.</li>
* <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
* all content until a line ending in '*' and '/' is found.</li>
* <li><b>Blank lines:</b> All blank lines will be skipped.</li>
* </ul>
*
* @since 1.6
*/
public class Rule {
public static final class Phoneme implements PhonemeExpr {
public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
@Override
public int compare(final Phoneme o1, final Phoneme o2) {
final int o1Length = o1.phonemeText.length();
final int o2Length = o2.phonemeText.length();
for (int i = 0; i < o1Length; i++) {
if (i >= o2Length) {
return +1;
}
final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
if (c != 0) {
return c;
}
}
if (o1Length < o2Length) {
return -1;
}
return 0;
}
};
private final StringBuilder phonemeText;
private final Languages.LanguageSet languages;
public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
this.phonemeText = new StringBuilder(phonemeText);
this.languages = languages;
}
public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
this(phonemeLeft.phonemeText, phonemeLeft.languages);
this.phonemeText.append(phonemeRight.phonemeText);
}
public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
this(phonemeLeft.phonemeText, languages);
this.phonemeText.append(phonemeRight.phonemeText);
}
public Phoneme append(final CharSequence str) {
this.phonemeText.append(str);
return this;
}
public Languages.LanguageSet getLanguages() {
return this.languages;
}
@Override
public Iterable<Phoneme> getPhonemes() {
return Collections.singleton(this);
}
public CharSequence getPhonemeText() {
return this.phonemeText;
}
/**
* Deprecated since 1.9.
*
* @param right the Phoneme to join
* @return a new Phoneme
* @deprecated since 1.9
*/
@Deprecated
public Phoneme join(final Phoneme right) {
return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
this.languages.restrictTo(right.languages));
}
/**
* Returns a new Phoneme with the same text but a union of its
* current language set and the given one.
*
* @param lang the language set to merge
* @return a new Phoneme
*/
public Phoneme mergeWithLanguage(final LanguageSet lang) {
return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
}
@Override
public String toString() {
return phonemeText.toString() + "[" + languages + "]";
}
}
public interface PhonemeExpr {
Iterable<Phoneme> getPhonemes();
}
public static final class PhonemeList implements PhonemeExpr {
private final List<Phoneme> phonemes;
public PhonemeList(final List<Phoneme> phonemes) {
this.phonemes = phonemes;
}
@Override
public List<Phoneme> getPhonemes() {
return this.phonemes;
}
}
/**
* A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
*/
public interface RPattern {
boolean isMatch(CharSequence input);
}
public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
@Override
public boolean isMatch(final CharSequence input) {
return true;
}
};
public static final String ALL = "ALL";
private static final String DOUBLE_QUOTE = "\"";
private static final String HASH_INCLUDE = "#include";
private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
new EnumMap<>(NameType.class);
static {
for (final NameType s : NameType.values()) {
final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
new EnumMap<>(RuleType.class);
for (final RuleType rt : RuleType.values()) {
final Map<String, Map<String, List<Rule>>> rs = new HashMap<>();
final Languages ls = Languages.getInstance(s);
for (final String l : ls.getLanguages()) {
try (final Scanner scanner = createScanner(s, rt, l)) {
rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
} catch (final IllegalStateException e) {
throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
}
}
if (!rt.equals(RuleType.RULES)) {
try (final Scanner scanner = createScanner(s, rt, "common")) {
rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
}
}
rts.put(rt, Collections.unmodifiableMap(rs));
}
RULES.put(s, Collections.unmodifiableMap(rts));
}
}
private static boolean contains(final CharSequence chars, final char input) {
for (int i = 0; i < chars.length(); i++) {
if (chars.charAt(i) == input) {
return true;
}
}
return false;
}
private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
nameType.getName(), rt.getName(), lang);
}
private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
final String resName = createResourceName(nameType, rt, lang);
return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
}
private static Scanner createScanner(final String lang) {
final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
}
private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
final int suffixLength = suffix.length();
final int inputLength = input.length();
if (suffixLength > inputLength) {
return false;
}
for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) {
if (input.charAt(i) != suffix.charAt(j)) {
return false;
}
}
return true;
}
/**
* Gets rules for a combination of name type, rule type and languages.
*
* @param nameType
* the NameType to consider
* @param rt
* the RuleType to consider
* @param langs
* the set of languages to consider
* @return a list of Rules that apply
*/
public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
final Languages.LanguageSet langs) {
final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
final List<Rule> allRules = new ArrayList<>();
for (final List<Rule> rules : ruleMap.values()) {
allRules.addAll(rules);
}
return allRules;
}
/**
* Gets rules for a combination of name type, rule type and a single language.
*
* @param nameType
* the NameType to consider
* @param rt
* the RuleType to consider
* @param lang
* the language to consider
* @return a list of Rules that apply
*/
public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
}
/**
* Gets rules for a combination of name type, rule type and languages.
*
* @param nameType
* the NameType to consider
* @param rt
* the RuleType to consider
* @param langs
* the set of languages to consider
* @return a map containing all Rules that apply, grouped by the first character of the rule pattern
* @since 1.9
*/
public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
final Languages.LanguageSet langs) {
return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
getInstanceMap(nameType, rt, Languages.ANY);
}
/**
* Gets rules for a combination of name type, rule type and a single language.
*
* @param nameType
* the NameType to consider
* @param rt
* the RuleType to consider
* @param lang
* the language to consider
* @return a map containing all Rules that apply, grouped by the first character of the rule pattern
* @since 1.9
*/
public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
final String lang) {
final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
if (rules == null) {
throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
nameType.getName(), rt.getName(), lang));
}
return rules;
}
private static Phoneme parsePhoneme(final String ph) {
final int open = ph.indexOf("[");
if (open >= 0) {
if (!ph.endsWith("]")) {
throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
}
final String before = ph.substring(0, open);
final String in = ph.substring(open + 1, ph.length() - 1);
final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]")));
return new Phoneme(before, Languages.LanguageSet.from(langs));
}
return new Phoneme(ph, Languages.ANY_LANGUAGE);
}
private static PhonemeExpr parsePhonemeExpr(final String ph) {
if (ph.startsWith("(")) { // we have a bracketed list of options
if (!ph.endsWith(")")) {
throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
}
final List<Phoneme> phs = new ArrayList<>();
final String body = ph.substring(1, ph.length() - 1);
for (final String part : body.split("[|]")) {
phs.add(parsePhoneme(part));
}
if (body.startsWith("|") || body.endsWith("|")) {
phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
}
return new PhonemeList(phs);
}
return parsePhoneme(ph);
}
private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
final Map<String, List<Rule>> lines = new HashMap<>();
int currentLine = 0;
boolean inMultilineComment = false;
while (scanner.hasNextLine()) {
currentLine++;
final String rawLine = scanner.nextLine();
String line = rawLine;
if (inMultilineComment) {
if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
inMultilineComment = false;
}
} else {
if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
inMultilineComment = true;
} else {
// discard comments
final int cmtI = line.indexOf(ResourceConstants.CMT);
if (cmtI >= 0) {
line = line.substring(0, cmtI);
}
// trim leading-trailing whitespace
line = line.trim();
if (line.isEmpty()) {
continue; // empty lines can be safely skipped
}
if (line.startsWith(HASH_INCLUDE)) {
// include statement
final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
if (incl.contains(" ")) {
throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
location);
}
try (final Scanner hashIncludeScanner = createScanner(incl)) {
lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
}
} else {
// rule
final String[] parts = line.split("\\s+");
if (parts.length != 4) {
throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
" parts: " + rawLine + " in " + location);
}
try {
final String pat = stripQuotes(parts[0]);
final String lCon = stripQuotes(parts[1]);
final String rCon = stripQuotes(parts[2]);
final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
final int cLine = currentLine;
final Rule r = new Rule(pat, lCon, rCon, ph) {
private final int myLine = cLine;
private final String loc = location;
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
sb.append("Rule");
sb.append("{line=").append(myLine);
sb.append(", loc='").append(loc).append('\'');
sb.append(", pat='").append(pat).append('\'');
sb.append(", lcon='").append(lCon).append('\'');
sb.append(", rcon='").append(rCon).append('\'');
sb.append('}');
return sb.toString();
}
};
final String patternKey = r.pattern.substring(0,1);
List<Rule> rules = lines.get(patternKey);
if (rules == null) {
rules = new ArrayList<>();
lines.put(patternKey, rules);
}
rules.add(r);
} catch (final IllegalArgumentException e) {
throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
location, e);
}
}
}
}
}
return lines;
}
/**
* Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
*
* @param regex
* the regular expression to compile
* @return an RPattern that will match this regex
*/
private static RPattern pattern(final String regex) {
final boolean startsWith = regex.startsWith("^");
final boolean endsWith = regex.endsWith("$");
final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
final boolean boxes = content.contains("[");
if (!boxes) {
if (startsWith && endsWith) {
// exact match
if (content.isEmpty()) {
// empty
return new RPattern() {
@Override
public boolean isMatch(final CharSequence input) {
return input.length() == 0;
}
};
}
return new RPattern() {
@Override
public boolean isMatch(final CharSequence input) {
return input.equals(content);
}
};
}
if ((startsWith || endsWith) && content.isEmpty()) {
// matches every string
return ALL_STRINGS_RMATCHER;
}
if (startsWith) {
// matches from start
return new RPattern() {
@Override
public boolean isMatch(final CharSequence input) {
return startsWith(input, content);
}
};
}
if (endsWith) {
// matches from start
return new RPattern() {
@Override
public boolean isMatch(final CharSequence input) {
return endsWith(input, content);
}
};
}
} else {
final boolean startsWithBox = content.startsWith("[");
final boolean endsWithBox = content.endsWith("]");
if (startsWithBox && endsWithBox) {
String boxContent = content.substring(1, content.length() - 1);
if (!boxContent.contains("[")) {
// box containing alternatives
final boolean negate = boxContent.startsWith("^");
if (negate) {
boxContent = boxContent.substring(1);
}
final String bContent = boxContent;
final boolean shouldMatch = !negate;
if (startsWith && endsWith) {
// exact match
return new RPattern() {
@Override
public boolean isMatch(final CharSequence input) {
return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
}
};
}
if (startsWith) {
// first char
return new RPattern() {
@Override
public boolean isMatch(final CharSequence input) {
return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
}
};
}
if (endsWith) {
// last char
return new RPattern() {
@Override
public boolean isMatch(final CharSequence input) {
return input.length() > 0 &&
contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
}
};
}
}
}
}
return new RPattern() {
final Pattern pattern = Pattern.compile(regex);
@Override
public boolean isMatch(final CharSequence input) {
final Matcher matcher = pattern.matcher(input);
return matcher.find();
}
};
}
private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
if (prefix.length() > input.length()) {
return false;
}
for (int i = 0; i < prefix.length(); i++) {
if (input.charAt(i) != prefix.charAt(i)) {
return false;
}
}
return true;
}
private static String stripQuotes(String str) {
if (str.startsWith(DOUBLE_QUOTE)) {
str = str.substring(1);
}
if (str.endsWith(DOUBLE_QUOTE)) {
str = str.substring(0, str.length() - 1);
}
return str;
}
private final RPattern lContext;
private final String pattern;
private final PhonemeExpr phoneme;
private final RPattern rContext;
/**
* Creates a new rule.
*
* @param pattern
* the pattern
* @param lContext
* the left context
* @param rContext
* the right context
* @param phoneme
* the resulting phoneme
*/
public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
this.pattern = pattern;
this.lContext = pattern(lContext + "$");
this.rContext = pattern("^" + rContext);
this.phoneme = phoneme;
}
/**
* Gets the left context. This is a regular expression that must match to the left of the pattern.
*
* @return the left context Pattern
*/
public RPattern getLContext() {
return this.lContext;
}
/**
* Gets the pattern. This is a string-literal that must exactly match.
*
* @return the pattern
*/
public String getPattern() {
return this.pattern;
}
/**
* Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
*
* @return the phoneme
*/
public PhonemeExpr getPhoneme() {
return this.phoneme;
}
/**
* Gets the right context. This is a regular expression that must match to the right of the pattern.
*
* @return the right context Pattern
*/
public RPattern getRContext() {
return this.rContext;
}
/**
* Decides if the pattern and context match the input starting at a position. It is a match if the
* {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and
* {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}.
*
* @param input
* the input String
* @param i
* the int position within the input
* @return true if the pattern and left/right context match, false otherwise
*/
public boolean patternAndContextMatches(final CharSequence input, final int i) {
if (i < 0) {
throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
}
final int patternLength = this.pattern.length();
final int ipl = i + patternLength;
if (ipl > input.length()) {
// not enough room for the pattern to match
return false;
}
// evaluate the pattern, left context and right context
// fail early if any of the evaluations is not successful
if (!input.subSequence(i, ipl).equals(this.pattern)) {
return false;
}
if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
return false;
}
return this.lContext.isMatch(input.subSequence(0, i));
}
}