| // commons-codec version compatibility level: 1.9 |
| using J2N; |
| using J2N.Collections.Generic.Extensions; |
| using J2N.Text; |
| using Lucene.Net.Support; |
| using Lucene.Net.Util; |
| using System; |
| using System.Collections.Generic; |
| using System.IO; |
| using System.Text; |
| using System.Text.RegularExpressions; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Analysis.Phonetic.Language.Bm |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// A phoneme rule. |
| /// </summary> |
| /// <remarks> |
| /// Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply |
| /// and a logical flag indicating if all languages must be in play. A rule matches if: |
| /// <list type="bullet"> |
| /// <item><description>the pattern matches at the current position</description></item> |
| /// <item><description>the string up until the beginning of the pattern matches the left context</description></item> |
| /// <item><description>the string from the end of the pattern matches the right context</description></item> |
| /// <item><description>logical is ALL and all languages are in scope; or</description></item> |
| /// <item><description>logical is any other value and at least one language is in scope</description></item> |
| /// </list> |
| /// <para/> |
| /// Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user |
| /// to explicitly construct their own. |
| /// <para/> |
| /// Rules are immutable and thread-safe. |
| /// <para/> |
| /// <b>Rules resources</b> |
| /// <para/> |
| /// Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically |
| /// named following the pattern: |
| /// <c>Lucene.Net.Analysis.Phonetic.Language.Bm.<see cref="NameType"/>_<see cref="RuleType"/>_[language].txt</c> |
| /// <para/> |
| /// The format of these resources is the following: |
| /// <list type="table"> |
| /// <item> |
| /// <term>Rules:</term> |
| /// <description> |
| /// whitespace separated, double-quoted strings. There should be 4 columns to each row, and these |
| /// will be interpreted as: |
| /// <list type="number"> |
| /// <item><description>pattern</description></item> |
| /// <item><description>left context</description></item> |
| /// <item><description>right context</description></item> |
| /// <item><description>phoneme</description></item> |
| /// </list> |
| /// </description> |
| /// </item> |
| /// <item> |
| /// <term>End-of-line comments:</term> |
| /// <description>Any occurrence of '//' will cause all text following on that line to be discarded as a comment.</description> |
| /// </item> |
| /// <item> |
| /// <term>Multi-line comments:</term> |
| /// <description>Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a line ending in '*' and '/' is found.</description> |
| /// </item> |
| /// <item> |
| /// <term>Blank lines:</term> |
| /// <description>All blank lines will be skipped.</description> |
| /// </item> |
| /// </list> |
| /// <para/> |
| /// since 1.6 |
| /// </remarks> |
| public class Rule |
| { |
// Delimiter patterns used when splitting rule resource content.
private static readonly Regex PIPE = new Regex("[|]", RegexOptions.Compiled);
private static readonly Regex WHITESPACE = new Regex("\\s+", RegexOptions.Compiled);
private static readonly Regex PLUS = new Regex("[+]", RegexOptions.Compiled);

/// <summary>
/// An <see cref="IRPattern"/> whose every overload reports a match, regardless of input.
/// </summary>
private class AllStringsRMatcher : IRPattern
{
    public bool IsMatch(StringBuilder input) => true;

    public bool IsMatch(string input) => true;

    public bool IsMatch(ICharSequence input) => true;
}

/// <summary>
/// Shared <see cref="IRPattern"/> instance that matches every input.
/// </summary>
public static readonly IRPattern ALL_STRINGS_RMATCHER = new AllStringsRMatcher();


/// <summary>
/// The literal <c>"ALL"</c>. NOTE(review): presumably the "ALL" logical flag named in the class remarks - confirm against callers.
/// </summary>
public const string ALL = "ALL";

// Quote character removed from both ends of each rule column by StripQuotes.
private const string DOUBLE_QUOTE = "\"";

// Prefix that marks an include statement inside a rules resource.
private const string HASH_INCLUDE = "#include";

// Parsed rules keyed by name type, then rule type, then language, then pattern key.
// Initialized after the fields above because LoadRules() depends on them.
private static readonly IDictionary<NameType, IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>>> RULES = LoadRules();
| |
/// <summary>
/// Parses every rules resource and builds the nested lookup stored in <c>RULES</c>.
/// </summary>
/// <remarks>
/// For each <see cref="NameType"/> and <see cref="RuleType"/> value, one resource is parsed per language
/// reported by <see cref="Languages.GetInstance(NameType)"/>. Every rule type other than
/// <see cref="RuleType.RULES"/> additionally gets a "common" entry.
/// </remarks>
/// <returns>The rules keyed by name type, rule type, language and pattern key.</returns>
/// <exception cref="InvalidOperationException">A per-language resource failed to parse.</exception>
private static IDictionary<NameType, IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>>> LoadRules() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
{
    var byNameType = new Dictionary<NameType, IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>>>();

    foreach (NameType nameType in Enum.GetValues(typeof(NameType)))
    {
        IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>> byRuleType =
            new Dictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>>();

        foreach (RuleType ruleType in Enum.GetValues(typeof(RuleType)))
        {
            IDictionary<string, IDictionary<string, IList<Rule>>> byLanguage =
                new Dictionary<string, IDictionary<string, IList<Rule>>>();

            Languages languages = Languages.GetInstance(nameType);
            foreach (string language in languages.GetLanguages())
            {
                try
                {
                    TextReader scanner = CreateScanner(nameType, ruleType, language);
                    string resourceName = CreateResourceName(nameType, ruleType, language);
                    byLanguage[language] = ParseRules(scanner, resourceName);
                }
                catch (InvalidOperationException e)
                {
                    // Re-wrap so the failing resource is named in the outermost message.
                    throw new InvalidOperationException("Problem processing " + CreateResourceName(nameType, ruleType, language), e);
                }
            }

            if (ruleType != RuleType.RULES)
            {
                TextReader commonScanner = CreateScanner(nameType, ruleType, "common");
                string commonName = CreateResourceName(nameType, ruleType, "common");
                byLanguage["common"] = ParseRules(commonScanner, commonName);
            }

            byRuleType[ruleType] = byLanguage.AsReadOnly();
        }

        byNameType[nameType] = byRuleType.AsReadOnly();
    }

    return byNameType;
}
| |
#pragma warning disable IDE0051 // Remove unused private members
// Returns true when any character of the sequence equals the given character.
private static bool Contains(ICharSequence chars, char input)
#pragma warning restore IDE0051 // Remove unused private members
{
    int index = 0;
    while (index < chars.Length)
    {
        if (chars[index] == input)
        {
            return true;
        }
        index++;
    }
    return false;
}
// Returns true when the string contains the given character.
private static bool Contains(string chars, char input)
{
    return chars.IndexOf(input) >= 0;
}
#pragma warning disable IDE0051 // Remove unused private members
// Returns true when any character of the builder equals the given character.
private static bool Contains(StringBuilder chars, char input)
#pragma warning restore IDE0051 // Remove unused private members
{
    // Scan direction does not affect the result; only membership is reported.
    for (int index = chars.Length - 1; index >= 0; index--)
    {
        if (chars[index] == input)
        {
            return true;
        }
    }
    return false;
}
| |
// Builds the resource file name "<nameType>_<ruleType>_<lang>.txt".
private static string CreateResourceName(NameType nameType, RuleType rt, string lang)
{
    string namePart = nameType.GetName();
    string rulePart = rt.GetName();
    return $"{namePart}_{rulePart}_{lang}.txt";
}
| |
// Opens the embedded rules resource for the given name type, rule type and language.
// Throws ArgumentException when no such resource stream is found.
private static TextReader CreateScanner(NameType nameType, RuleType rt, string lang)
{
    string resName = CreateResourceName(nameType, rt, lang);
    Stream rulesIS = typeof(Languages).FindAndGetManifestResourceStream(resName)
        ?? throw new ArgumentException("Unable to load resource: " + resName);

    return new StreamReader(rulesIS, ResourceConstants.ENCODING);
}
| |
// Opens the embedded resource named "<lang>.txt"; used for include statements in rule files.
// Throws ArgumentException when no such resource stream is found.
private static TextReader CreateScanner(string lang)
{
    string resName = $"{lang}.txt";
    Stream rulesIS = typeof(Languages).FindAndGetManifestResourceStream(resName)
        ?? throw new ArgumentException("Unable to load resource: " + resName);

    return new StreamReader(rulesIS, ResourceConstants.ENCODING);
}
| |
// Returns true when the last suffix.Length characters of the sequence equal suffix (char-by-char).
private static bool EndsWith(ICharSequence input, string suffix)
{
    int offset = input.Length - suffix.Length;
    if (offset < 0)
    {
        return false; // suffix is longer than the input
    }

    for (int k = 0; k < suffix.Length; k++)
    {
        if (input[offset + k] != suffix[k])
        {
            return false;
        }
    }
    return true;
}
| |
// Returns true when the last suffix.Length characters of the string equal suffix (char-by-char).
private static bool EndsWith(string input, string suffix)
{
    int offset = input.Length - suffix.Length;
    if (offset < 0)
    {
        return false; // suffix is longer than the input
    }

    for (int k = 0; k < suffix.Length; k++)
    {
        if (input[offset + k] != suffix[k])
        {
            return false;
        }
    }
    return true;
}
| |
// Returns true when the last suffix.Length characters of the builder equal suffix (char-by-char).
private static bool EndsWith(StringBuilder input, string suffix)
{
    int offset = input.Length - suffix.Length;
    if (offset < 0)
    {
        return false; // suffix is longer than the input
    }

    for (int k = 0; k < suffix.Length; k++)
    {
        if (input[offset + k] != suffix[k])
        {
            return false;
        }
    }
    return true;
}
| |
/// <summary>
/// Gets rules for a combination of name type, rule type and languages, flattened into a single list.
/// </summary>
/// <param name="nameType">The <see cref="NameType"/> to consider.</param>
/// <param name="rt">The <see cref="RuleType"/> to consider.</param>
/// <param name="langs">The set of languages to consider.</param>
/// <returns>A newly created list holding every <see cref="Rule"/> from the matching rule map.</returns>
public static IList<Rule> GetInstance(NameType nameType, RuleType rt,
                                      LanguageSet langs)
{
    IList<Rule> flattened = new List<Rule>();
    foreach (IList<Rule> bucket in GetInstanceMap(nameType, rt, langs).Values)
    {
        flattened.AddRange(bucket);
    }
    return flattened;
}
| |
/// <summary>
/// Gets rules for a combination of name type, rule type and a single language.
/// </summary>
/// <param name="nameType">The <see cref="NameType"/> to consider.</param>
/// <param name="rt">The <see cref="RuleType"/> to consider.</param>
/// <param name="lang">The language to consider.</param>
/// <returns>A list of <see cref="Rule"/>s that apply.</returns>
public static IList<Rule> GetInstance(NameType nameType, RuleType rt, string lang)
{
    // Wrap the single language in a set so the LanguageSet overload can be reused.
    var singleLanguage = new JCG.HashSet<string>() { lang };
    return GetInstance(nameType, rt, LanguageSet.From(singleLanguage));
}
| |
/// <summary>
/// Gets rules for a combination of name type, rule type and languages.
/// <para/>
/// since 1.9
/// </summary>
/// <remarks>
/// A singleton language set selects the rules of that one language; any other set
/// selects the rules stored under <see cref="Languages.ANY"/>.
/// </remarks>
/// <param name="nameType">The <see cref="NameType"/> to consider.</param>
/// <param name="rt">The <see cref="RuleType"/> to consider.</param>
/// <param name="langs">The set of languages to consider.</param>
/// <returns>A map containing all <see cref="Rule"/>s that apply, grouped by the first character of the rule pattern.</returns>
public static IDictionary<string, IList<Rule>> GetInstanceMap(NameType nameType, RuleType rt,
                                                             LanguageSet langs)
{
    if (langs.IsSingleton)
    {
        return GetInstanceMap(nameType, rt, langs.GetAny());
    }

    return GetInstanceMap(nameType, rt, Languages.ANY);
}
| |
/// <summary>
/// Gets rules for a combination of name type, rule type and a single language.
/// <para/>
/// since 1.9
/// </summary>
/// <param name="nameType">The <see cref="NameType"/> to consider.</param>
/// <param name="rt">The <see cref="RuleType"/> to consider.</param>
/// <param name="lang">The language to consider.</param>
/// <returns>A map containing all <see cref="Rule"/>s that apply, grouped by the first character of the rule pattern.</returns>
/// <exception cref="ArgumentException">No non-null rule map is registered for the combination.</exception>
public static IDictionary<string, IList<Rule>> GetInstanceMap(NameType nameType, RuleType rt,
                                                             string lang)
{
    // Walk the three lookup levels; any missing or null level means "no rules".
    if (RULES.TryGetValue(nameType, out var byRuleType) && byRuleType != null &&
        byRuleType.TryGetValue(rt, out var byLanguage) && byLanguage != null &&
        byLanguage.TryGetValue(lang, out var rules) && rules != null)
    {
        return rules;
    }

    throw new ArgumentException(string.Format("No rules found for {0}, {1}, {2}.",
                                              nameType.GetName(), rt.GetName(), lang));
}
| |
// Parses a single phoneme of the form "text" or "text[lang1+lang2]".
// Without a '[' the phoneme applies to Languages.ANY_LANGUAGE.
private static Phoneme ParsePhoneme(string ph)
{
    int bracketAt = ph.IndexOf('[');
    if (bracketAt < 0)
    {
        return new Phoneme(ph, Languages.ANY_LANGUAGE);
    }

    if (!ph.EndsWith("]", StringComparison.Ordinal))
    {
        throw new ArgumentException("Phoneme expression contains a '[' but does not end in ']'");
    }

    string text = ph.Substring(0, bracketAt);
    int specStart = bracketAt + 1;
    // Everything between '[' and the trailing ']' is a '+'-separated language list.
    string languageSpec = ph.Substring(specStart, (ph.Length - 1) - specStart);
    ISet<string> langs = new JCG.HashSet<string>(PLUS.Split(languageSpec).TrimEnd());

    return new Phoneme(text, LanguageSet.From(langs));
}
| |
// Parses a phoneme expression: either a single phoneme, or a parenthesized
// '|'-separated list of alternatives such as "(a|b[lang])".
private static IPhonemeExpr ParsePhonemeExpr(string ph)
{
    if (!ph.StartsWith("(", StringComparison.Ordinal))
    {
        return ParsePhoneme(ph);
    }

    // we have a bracketed list of options
    if (!ph.EndsWith(")", StringComparison.Ordinal))
    {
        throw new ArgumentException("Phoneme starts with '(' so must end with ')'");
    }

    string body = ph.Substring(1, (ph.Length - 1) - 1);
    IList<Phoneme> alternatives = new List<Phoneme>();
    foreach (string part in PIPE.Split(body).TrimEnd())
    {
        alternatives.Add(ParsePhoneme(part));
    }

    // A leading or trailing '|' adds one empty phoneme alternative at the end of the list.
    bool hasEmptyAlternative = body.StartsWith("|", StringComparison.Ordinal) || body.EndsWith("|", StringComparison.Ordinal);
    if (hasEmptyAlternative)
    {
        alternatives.Add(new Phoneme("", Languages.ANY_LANGUAGE));
    }

    return new PhonemeList(alternatives);
}
| |
/// <summary>
/// A <see cref="Rule"/> that remembers the line number and location it was parsed from,
/// and reports them from <see cref="ToString()"/>.
/// </summary>
private class RuleAnonymousHelper : Rule
{
    private readonly int myLine;
    private readonly string loc;

    public RuleAnonymousHelper(string pat, string lCon, string rCon, IPhonemeExpr ph, int cLine, string location)
        : base(pat, lCon, rCon, ph)
    {
        myLine = cLine;
        loc = location;
    }

    public override string ToString()
    {
        return "Rule{line=" + myLine + ", loc='" + loc + "'}";
    }
}
| |
/// <summary>
/// Reads a rules resource line by line and groups the parsed rules by the first character of their pattern.
/// </summary>
/// <remarks>
/// Multi-line comments, end-of-line comments and blank lines are skipped. A line starting with
/// <c>#include</c> is parsed recursively and its result is merged in via <c>PutAll</c>. Any other
/// line must split into exactly four whitespace-separated columns. The reader is always disposed.
/// </remarks>
/// <param name="reader">The reader over the rules resource; disposed before returning.</param>
/// <param name="location">A description of the resource, used in error messages and rule descriptions.</param>
/// <returns>The parsed rules keyed by pattern key.</returns>
/// <exception cref="ArgumentException">An include or rule statement is malformed.</exception>
/// <exception cref="InvalidOperationException">Building a rule from a four-column line failed.</exception>
private static IDictionary<string, IList<Rule>> ParseRules(TextReader reader, string location)
{
    IDictionary<string, IList<Rule>> rulesByKey = new JCG.Dictionary<string, IList<Rule>>();
    int lineNumber = 0;
    bool insideBlockComment = false;

    try
    {
        for (string rawLine = reader.ReadLine(); rawLine != null; rawLine = reader.ReadLine())
        {
            lineNumber++;

            if (insideBlockComment)
            {
                // Stay in comment mode until a line ends with the closing marker.
                if (rawLine.EndsWith(ResourceConstants.EXT_CMT_END, StringComparison.Ordinal))
                {
                    insideBlockComment = false;
                }
                continue;
            }

            if (rawLine.StartsWith(ResourceConstants.EXT_CMT_START, StringComparison.Ordinal))
            {
                insideBlockComment = true;
                continue;
            }

            // discard end-of-line comments, then trim leading-trailing whitespace
            int commentAt = rawLine.IndexOf(ResourceConstants.CMT, StringComparison.Ordinal);
            string line = commentAt >= 0 ? rawLine.Substring(0, commentAt) : rawLine;
            line = line.Trim();

            if (line.Length == 0)
            {
                continue; // empty lines can be safely skipped
            }

            if (line.StartsWith(HASH_INCLUDE, StringComparison.Ordinal))
            {
                // include statement
                string incl = line.Substring(HASH_INCLUDE.Length).Trim();
                if (incl.Contains(" "))
                {
                    throw new ArgumentException("Malformed import statement '" + rawLine + "' in " +
                                                location);
                }

                rulesByKey.PutAll(ParseRules(CreateScanner(incl), location + "->" + incl));
                continue;
            }

            // rule
            string[] parts = WHITESPACE.Split(line).TrimEnd();
            if (parts.Length != 4)
            {
                throw new ArgumentException("Malformed rule statement split into " + parts.Length +
                                            " parts: " + rawLine + " in " + location);
            }

            try
            {
                string pat = StripQuotes(parts[0]);
                string lCon = StripQuotes(parts[1]);
                string rCon = StripQuotes(parts[2]);
                IPhonemeExpr ph = ParsePhonemeExpr(StripQuotes(parts[3]));
                Rule r = new RuleAnonymousHelper(pat, lCon, rCon, ph, lineNumber, location);

                // Rules are bucketed by the first character of their pattern.
                string patternKey = r.pattern.Substring(0, 1);
                if (!rulesByKey.TryGetValue(patternKey, out IList<Rule> bucket) || bucket == null)
                {
                    bucket = new List<Rule>();
                    rulesByKey[patternKey] = bucket;
                }
                bucket.Add(r);
            }
            catch (ArgumentException e)
            {
                throw new InvalidOperationException("Problem parsing line '" + lineNumber + "' in " +
                                                    location, e);
            }
        }
    }
    finally
    {
        reader.Dispose();
    }

    return rulesByKey;
}
| |
/// <summary>
/// An <see cref="IRPattern"/> that forwards each overload of <c>IsMatch</c> to a delegate supplied at construction.
/// </summary>
private class RPatternHelper : IRPattern
{
    private readonly Func<StringBuilder, bool> matchBuilder;
    private readonly Func<string, bool> matchString;
    private readonly Func<ICharSequence, bool> matchSequence;

    // Parameter names are part of the contract: callers pass them as named arguments.
    public RPatternHelper(Func<StringBuilder, bool> isMatchSB, Func<string, bool> isMatchStr, Func<ICharSequence, bool> isMatchCS)
    {
        matchBuilder = isMatchSB;
        matchString = isMatchStr;
        matchSequence = isMatchCS;
    }

    public bool IsMatch(StringBuilder input) => matchBuilder(input);

    public bool IsMatch(string input) => matchString(input);

    public bool IsMatch(ICharSequence input) => matchSequence(input);
}
| |
/// <summary>
/// Attempts to compile the regex into direct string ops, falling back to <see cref="Regex"/> and <see cref="Match"/> in the worst case.
/// </summary>
/// <remarks>
/// The leading <c>^</c> and trailing <c>$</c> anchors are stripped and the remaining content is inspected:
/// <list type="bullet">
/// <item><description>content without <c>[</c> becomes an empty / exact / starts-with / ends-with check;</description></item>
/// <item><description>content that is a single <c>[...]</c> box (optionally negated with <c>^</c>) becomes a one-character membership check;</description></item>
/// <item><description>anything else is compiled as a <see cref="Regex"/>.</description></item>
/// </list>
/// </remarks>
/// <param name="regex">The regular expression to compile.</param>
/// <returns>An RPattern that will match this regex.</returns>
private static IRPattern GetPattern(string regex)
{
    bool startsWith = regex.StartsWith("^", StringComparison.Ordinal);
    bool endsWith = regex.EndsWith("$", StringComparison.Ordinal);
    int contentStart = startsWith ? 1 : 0;
    int contentEnd = endsWith ? regex.Length - 1 : regex.Length;
    string content = regex.Substring(contentStart, contentEnd - contentStart);
    bool boxes = content.Contains("[");

    if (!boxes)
    {
        if (startsWith && endsWith)
        {
            // exact match
            if (content.Length == 0)
            {
                // empty
                return new RPatternHelper(
                    isMatchSB: (input) => input.Length == 0,
                    isMatchStr: (input) => input.Length == 0,
                    isMatchCS: (input) => input.Length == 0);
            }
            else
            {
                return new RPatternHelper(
                    // Compare characters explicitly: StringBuilder.Equals(string) can bind to
                    // object.Equals (reference equality) on targets lacking a span overload,
                    // which would make this matcher always return false.
                    isMatchSB: (input) => input.Length == content.Length && StartsWith(input, content),
                    isMatchStr: (input) => input.Equals(content),
                    // NOTE(review): relies on the ICharSequence implementation's Equals
                    // accepting a string - confirm against the J2N implementations in use.
                    isMatchCS: (input) => input.Equals(content));
            }
        }
        else if ((startsWith || endsWith) && content.Length == 0)
        {
            // matches every string
            return ALL_STRINGS_RMATCHER;
        }
        else if (startsWith)
        {
            // matches from start
            return new RPatternHelper(
                isMatchSB: (input) => StartsWith(input, content),
                isMatchStr: (input) => StartsWith(input, content),
                isMatchCS: (input) => StartsWith(input, content));
        }
        else if (endsWith)
        {
            // matches from end
            return new RPatternHelper(
                isMatchSB: (input) => EndsWith(input, content),
                isMatchStr: (input) => EndsWith(input, content),
                isMatchCS: (input) => EndsWith(input, content));
        }
    }
    else
    {
        bool startsWithBox = content.StartsWith("[", StringComparison.Ordinal);
        bool endsWithBox = content.EndsWith("]", StringComparison.Ordinal);

        if (startsWithBox && endsWithBox)
        {
            string boxContent = content.Substring(1, (content.Length - 1) - 1);
            if (!boxContent.Contains("["))
            {
                // box containing alternatives
                bool negate = boxContent.StartsWith("^", StringComparison.Ordinal);
                if (negate)
                {
                    boxContent = boxContent.Substring(1);
                }
                // Copies captured by the lambdas below.
                string bContent = boxContent;
                bool shouldMatch = !negate;

                if (startsWith && endsWith)
                {
                    // exact match
                    return new RPatternHelper(
                        isMatchSB: (input) => input.Length == 1 && Contains(bContent, input[0]) == shouldMatch,
                        isMatchStr: (input) => input.Length == 1 && Contains(bContent, input[0]) == shouldMatch,
                        isMatchCS: (input) => input.Length == 1 && Contains(bContent, input[0]) == shouldMatch);
                }
                else if (startsWith)
                {
                    // first char
                    return new RPatternHelper(
                        isMatchSB: (input) => input.Length > 0 && Contains(bContent, input[0]) == shouldMatch,
                        isMatchStr: (input) => input.Length > 0 && Contains(bContent, input[0]) == shouldMatch,
                        isMatchCS: (input) => input.Length > 0 && Contains(bContent, input[0]) == shouldMatch);
                }
                else if (endsWith)
                {
                    // last char
                    return new RPatternHelper(
                        isMatchSB: (input) => input.Length > 0 && Contains(bContent, input[input.Length - 1]) == shouldMatch,
                        isMatchStr: (input) => input.Length > 0 && Contains(bContent, input[input.Length - 1]) == shouldMatch,
                        isMatchCS: (input) => input.Length > 0 && Contains(bContent, input[input.Length - 1]) == shouldMatch);
                }
            }
        }
    }

    // Fallback: no shortcut applied, so use a real regular expression.
    Regex pattern = new Regex(regex, RegexOptions.Compiled);

    return new RPatternHelper(
        isMatchSB: (input) => pattern.Match(input.ToString()).Success,
        isMatchStr: (input) => pattern.Match(input).Success,
        isMatchCS: (input) => pattern.Match(input.ToString()).Success);
}
| |
// Returns true when the first prefix.Length characters of the sequence equal prefix (char-by-char).
private static bool StartsWith(ICharSequence input, string prefix)
{
    int remaining = prefix.Length;
    if (remaining > input.Length)
    {
        return false; // prefix is longer than the input
    }

    while (--remaining >= 0)
    {
        if (input[remaining] != prefix[remaining])
        {
            return false;
        }
    }
    return true;
}
| |
// Returns true when the first prefix.Length characters of the string equal prefix (char-by-char).
private static bool StartsWith(string input, string prefix)
{
    int remaining = prefix.Length;
    if (remaining > input.Length)
    {
        return false; // prefix is longer than the input
    }

    while (--remaining >= 0)
    {
        if (input[remaining] != prefix[remaining])
        {
            return false;
        }
    }
    return true;
}
| |
// Returns true when the first prefix.Length characters of the builder equal prefix (char-by-char).
private static bool StartsWith(StringBuilder input, string prefix)
{
    int remaining = prefix.Length;
    if (remaining > input.Length)
    {
        return false; // prefix is longer than the input
    }

    while (--remaining >= 0)
    {
        if (input[remaining] != prefix[remaining])
        {
            return false;
        }
    }
    return true;
}
| |
// Removes one leading and then one trailing double quote, each only if present.
// The trailing check runs on the already-shortened string, so a lone quote yields "".
private static string StripQuotes(string str)
{
    string withoutLeading = str.StartsWith(DOUBLE_QUOTE, StringComparison.Ordinal)
        ? str.Substring(1)
        : str;

    return withoutLeading.EndsWith(DOUBLE_QUOTE, StringComparison.Ordinal)
        ? withoutLeading.Substring(0, withoutLeading.Length - 1)
        : withoutLeading;
}
| |
// Matcher built from the left-context expression with "$" appended.
private readonly IRPattern lContext;

// Literal text the rule matches.
private readonly string pattern;

// Phoneme expression associated with the rule.
private readonly IPhonemeExpr phoneme;

// Matcher built from the right-context expression with "^" prepended.
private readonly IRPattern rContext;

/// <summary>
/// Creates a new rule.
/// </summary>
/// <remarks>
/// The left context is anchored to the end of the text before the pattern and the right
/// context to the start of the text after it; both are compiled via <c>GetPattern</c>.
/// </remarks>
/// <param name="pattern">The pattern.</param>
/// <param name="lContext">The left context.</param>
/// <param name="rContext">The right context.</param>
/// <param name="phoneme">The resulting phoneme.</param>
public Rule(string pattern, string lContext, string rContext, IPhonemeExpr phoneme)
{
    string leftExpression = lContext + "$";
    string rightExpression = "^" + rContext;

    this.pattern = pattern;
    this.lContext = GetPattern(leftExpression);
    this.rContext = GetPattern(rightExpression);
    this.phoneme = phoneme;
}
| |
/// <summary>
/// Gets the left context pattern. This is a regular expression that must match to the left of the pattern.
/// </summary>
public virtual IRPattern LContext
{
    get { return lContext; }
}

/// <summary>
/// Gets the pattern. This is a string-literal that must exactly match.
/// </summary>
public virtual string Pattern
{
    get { return pattern; }
}

/// <summary>
/// Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
/// </summary>
public virtual IPhonemeExpr Phoneme
{
    get { return phoneme; }
}

/// <summary>
/// Gets the right context pattern. This is a regular expression that must match to the right of the pattern.
/// </summary>
public virtual IRPattern RContext
{
    get { return rContext; }
}
| |
/// <summary>
/// Decides if the pattern and context match the input starting at a position. It is a match if the
/// <see cref="LContext"/> matches <paramref name="input"/> up to <paramref name="i"/>, <see cref="Pattern"/> matches at <paramref name="i"/> and
/// <see cref="RContext"/> matches from the end of the match of <see cref="Pattern"/> to the end of <paramref name="input"/>.
/// </summary>
/// <param name="input">The input <see cref="ICharSequence"/>.</param>
/// <param name="i">The int position within the input.</param>
/// <returns><c>true</c> if the pattern and left/right context match, <c>false</c> otherwise.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="i"/> is negative.</exception>
public virtual bool PatternAndContextMatches(ICharSequence input, int i)
{
    if (i < 0)
    {
        // The (paramName, message) overload is required; the single-string overload
        // would treat the message as the parameter name.
        throw new ArgumentOutOfRangeException(nameof(i), "Can not match pattern at negative indexes");
    }

    int patternLength = this.pattern.Length;
    int ipl = i + patternLength;

    if (ipl > input.Length)
    {
        // not enough room for the pattern to match
        return false;
    }

    // evaluate the pattern, left context and right context
    // fail early if any of the evaluations is not successful
    if (!input.Subsequence(i, ipl - i).Equals(this.pattern)) // LUCENENET: Corrected 2nd Subseqence parameter
    {
        return false;
    }
    else if (!this.rContext.IsMatch(input.Subsequence(ipl, input.Length - ipl))) // LUCENENET: Corrected 2nd Subseqence parameter
    {
        return false;
    }
    return this.lContext.IsMatch(input.Subsequence(0, i - 0)); // LUCENENET: Corrected 2nd Subseqence parameter
}
| |
/// <summary>
/// Decides if the pattern and context match the input starting at a position. It is a match if the
/// <see cref="LContext"/> matches <paramref name="input"/> up to <paramref name="i"/>, <see cref="Pattern"/> matches at <paramref name="i"/> and
/// <see cref="RContext"/> matches from the end of the match of <see cref="Pattern"/> to the end of <paramref name="input"/>.
/// </summary>
/// <param name="input">The input <see cref="string"/>.</param>
/// <param name="i">The int position within the input.</param>
/// <returns><c>true</c> if the pattern and left/right context match, <c>false</c> otherwise.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="i"/> is negative.</exception>
// LUCENENET specific
public virtual bool PatternAndContextMatches(string input, int i)
{
    if (i < 0)
    {
        // The (paramName, message) overload is required; the single-string overload
        // would treat the message as the parameter name.
        throw new ArgumentOutOfRangeException(nameof(i), "Can not match pattern at negative indexes");
    }

    int patternLength = this.pattern.Length;
    int ipl = i + patternLength;

    if (ipl > input.Length)
    {
        // not enough room for the pattern to match
        return false;
    }

    // evaluate the pattern, left context and right context
    // fail early if any of the evaluations is not successful
    // Ordinal in-place comparison of input[i..ipl) with the pattern; avoids a temporary substring.
    if (string.CompareOrdinal(input, i, this.pattern, 0, patternLength) != 0)
    {
        return false;
    }
    else if (!this.rContext.IsMatch(input.Substring(ipl, (input.Length - ipl))))
    {
        return false;
    }
    return this.lContext.IsMatch(input.Substring(0, (i - 0)));
}
| |
/// <summary>
/// Decides if the pattern and context match the input starting at a position. It is a match if the
/// <see cref="LContext"/> matches <paramref name="input"/> up to <paramref name="i"/>, <see cref="Pattern"/> matches at <paramref name="i"/> and
/// <see cref="RContext"/> matches from the end of the match of <see cref="Pattern"/> to the end of <paramref name="input"/>.
/// </summary>
/// <param name="input">The input <see cref="StringBuilder"/>.</param>
/// <param name="i">The int position within the input.</param>
/// <returns><c>true</c> if the pattern and left/right context match, <c>false</c> otherwise.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="i"/> is negative.</exception>
// LUCENENET specific
public virtual bool PatternAndContextMatches(StringBuilder input, int i)
{
    if (i < 0)
    {
        // The (paramName, message) overload is required; the single-string overload
        // would treat the message as the parameter name.
        throw new ArgumentOutOfRangeException(nameof(i), "Can not match pattern at negative indexes");
    }

    int patternLength = this.pattern.Length;
    int ipl = i + patternLength;

    if (ipl > input.Length)
    {
        // not enough room for the pattern to match
        return false;
    }

    // evaluate the pattern, left context and right context
    // fail early if any of the evaluations is not successful
    if (!input.ToString(i, (ipl - i)).Equals(this.pattern, StringComparison.Ordinal))
    {
        return false;
    }
    else if (!this.rContext.IsMatch(input.ToString(ipl, (input.Length - ipl))))
    {
        return false;
    }
    return this.lContext.IsMatch(input.ToString(0, (i - 0)));
}
| |
| } |
| |
/// <summary>
/// A piece of phoneme text together with the <see cref="LanguageSet"/> it applies to.
/// </summary>
/// <remarks>
/// The text is held in a private <see cref="StringBuilder"/> copy and can grow via <see cref="Append(string)"/>.
/// </remarks>
public sealed class Phoneme : IPhonemeExpr
{
    /// <summary>
    /// Orders phonemes by comparing their text character by character; when one text is a
    /// prefix of the other, the shorter text sorts first.
    /// </summary>
    private class PhonemeComparer : IComparer<Phoneme>
    {
        public int Compare(Phoneme o1, Phoneme o2)
        {
            StringBuilder left = o1.phonemeText;
            StringBuilder right = o2.phonemeText;

            int index = 0;
            while (index < left.Length)
            {
                if (index >= right.Length)
                {
                    // right is a proper prefix of left
                    return +1;
                }

                int difference = left[index] - right[index];
                if (difference != 0)
                {
                    return difference;
                }
                index++;
            }

            // left is exhausted: it is either a proper prefix of right, or equal to it
            return left.Length < right.Length ? -1 : 0;
        }
    }

    /// <summary>Comparer that orders phonemes by their text.</summary>
    public static readonly IComparer<Phoneme> COMPARER = new PhonemeComparer();
    private readonly StringBuilder phonemeText;
    private readonly LanguageSet languages;

    /// <summary>Creates a phoneme from text and the languages it applies to.</summary>
    public Phoneme(string phonemeText, LanguageSet languages)
    {
        this.languages = languages;
        this.phonemeText = new StringBuilder(phonemeText);
    }

    /// <summary>Creates a phoneme from a copy of the builder's current text.</summary>
    public Phoneme(StringBuilder phonemeText, LanguageSet languages)
    {
        this.languages = languages;
        this.phonemeText = new StringBuilder(phonemeText.ToString());
    }

    /// <summary>Creates a phoneme from a copy of the character sequence's text.</summary>
    public Phoneme(ICharSequence phonemeText, LanguageSet languages)
    {
        this.languages = languages;
        this.phonemeText = new StringBuilder(phonemeText.ToString());
    }

    /// <summary>
    /// Creates a phoneme whose text is the left text followed by the right text,
    /// using the left phoneme's languages.
    /// </summary>
    public Phoneme(Phoneme phonemeLeft, Phoneme phonemeRight)
        : this(phonemeLeft.phonemeText, phonemeLeft.languages)
    {
        phonemeText.Append(phonemeRight.phonemeText);
    }

    /// <summary>
    /// Creates a phoneme whose text is the left text followed by the right text,
    /// using the supplied languages.
    /// </summary>
    public Phoneme(Phoneme phonemeLeft, Phoneme phonemeRight, LanguageSet languages)
        : this(phonemeLeft.phonemeText, languages)
    {
        phonemeText.Append(phonemeRight.phonemeText);
    }

    /// <summary>Appends text to this phoneme in place and returns this instance.</summary>
    public Phoneme Append(string str)
    {
        phonemeText.Append(str);
        return this;
    }

    /// <summary>Gets the languages this phoneme applies to.</summary>
    public LanguageSet Languages
    {
        get { return languages; }
    }

    /// <summary>Gets a new single-element array containing this phoneme.</summary>
    public IList<Phoneme> Phonemes
    {
        get { return new Phoneme[] { this }; }
    }

    /// <summary>Gets the current phoneme text as a string.</summary>
    public string GetPhonemeText()
    {
        return phonemeText.ToString();
    }

    /// <summary>
    /// Creates a new phoneme with the concatenated text of this and <paramref name="right"/>,
    /// and this phoneme's languages restricted to those of <paramref name="right"/>.
    /// </summary>
    [Obsolete("since 1.9")]
    public Phoneme Join(Phoneme right)
    {
        string joinedText = phonemeText.ToString() + right.phonemeText.ToString();
        LanguageSet restricted = languages.RestrictTo(right.Languages);
        return new Phoneme(joinedText, restricted);
    }
}
| |
/// <summary>
/// An expression that exposes a list of <see cref="Phoneme"/>s.
/// </summary>
public interface IPhonemeExpr
{
    /// <summary>
    /// Gets the phonemes of this expression.
    /// </summary>
    IList<Phoneme> Phonemes { get; }
}
| |
/// <summary>
/// An <see cref="IPhonemeExpr"/> backed by the list of phonemes supplied at construction.
/// </summary>
public sealed class PhonemeList : IPhonemeExpr
{
    /// <summary>Wraps the given list; the list is stored as-is, not copied.</summary>
    public PhonemeList(IList<Phoneme> phonemes)
    {
        Phonemes = phonemes;
    }

    /// <summary>Gets the list supplied at construction.</summary>
    public IList<Phoneme> Phonemes { get; }
}
| |
/// <summary>
/// A minimal wrapper around the functionality of <see cref="Rule"/> Pattern that we use, to allow for alternate implementations.
/// </summary>
public interface IRPattern
{
    /// <summary>Decides whether the given character sequence matches this pattern.</summary>
    bool IsMatch(ICharSequence input);
    /// <summary>Decides whether the given string matches this pattern.</summary>
    bool IsMatch(string input);
    /// <summary>Decides whether the current content of the given builder matches this pattern.</summary>
    bool IsMatch(StringBuilder input);
}
| } |