| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Analysis.Util; |
| using System; |
| using System.Collections.Generic; |
| using System.IO; |
| using System.Text; |
| using System.Text.RegularExpressions; |
| |
| namespace Lucene.Net.Analysis.Synonym |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
/// <summary>
/// Factory for <see cref="SlowSynonymFilter"/> (only used with luceneMatchVersion &lt; 3.4)
/// <code>
/// &lt;fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
///   &lt;analyzer&gt;
///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
///     &lt;filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
///             expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/&gt;
///   &lt;/analyzer&gt;
/// &lt;/fieldType&gt;</code>
/// </summary>
/// <remarks>
/// Deprecated (3.4): use SynonymFilterFactory instead. Only for precise index backwards compatibility. This factory will be removed in Lucene 5.0.
/// </remarks>
| [Obsolete("(3.4) use SynonymFilterFactory instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0")] |
| internal sealed class SlowSynonymFilterFactory : TokenFilterFactory, IResourceLoaderAware |
| { |
// Value of the required "synonyms" argument; Inform passes it to LoadRules.
private readonly string synonyms;
// Value of the "ignoreCase" argument (false when not supplied); Inform passes it to the SlowSynonymMap constructor.
private readonly bool ignoreCase;
// Value of the "expand" argument (true when not supplied); Inform passes it to ParseRules as the expansion flag.
private readonly bool expand;
// Value of the "tokenizerFactory" argument; the constructor and Inform both treat null as "no tokenizer factory configured".
private readonly string tf;
// Arguments passed to the tokenizer factory's constructor by LoadTokenizerFactory; only populated when tf is not null.
private readonly IDictionary<string, string> tokArgs = new Dictionary<string, string>();
| |
// LUCENENET: Optimized by pre-compiling the regex and lazy-loading it
/// <summary>
/// Holds the compiled pattern that matches a leading "tokenizerFactory." prefix
/// on an argument key. The class carries static state only.
/// </summary>
private static class Holder
{
    public static readonly Regex TOKENIZER_FACTORY_REPLACEMENT_PATTERN =
        new Regex("^tokenizerFactory\\.", RegexOptions.Compiled);
}
| |
/// <summary>
/// Creates the factory from its configuration arguments.
/// <para/>
/// Reads "synonyms" (required), "ignoreCase" (default false), "expand" (default true)
/// and the optional "tokenizerFactory". When a tokenizer factory is configured, every
/// argument still left in <paramref name="args"/> is moved into the tokenizer factory's
/// own argument map, with a leading "tokenizerFactory." prefix stripped from its key.
/// </summary>
/// <param name="args"> the configuration arguments; entries are removed as they are consumed here
/// (the inherited Require/GetBoolean/Get helpers presumably remove the keys they read - confirm against the base class) </param>
/// <exception cref="ArgumentException"> if any argument is left over once everything above has been consumed </exception>
public SlowSynonymFilterFactory(IDictionary<string, string> args)
    : base(args)
{
    synonyms = Require(args, "synonyms");
    ignoreCase = GetBoolean(args, "ignoreCase", false);
    expand = GetBoolean(args, "expand", true);
    tf = Get(args, "tokenizerFactory");

    if (tf != null)
    {
        AssureMatchVersion();
        tokArgs["luceneMatchVersion"] = LuceneMatchVersion.ToString();

        // Take a snapshot of the keys first: args is modified while we walk it.
        string[] remainingKeys = new string[args.Count];
        args.Keys.CopyTo(remainingKeys, 0);
        foreach (string remainingKey in remainingKeys)
        {
            string tokenizerKey = Holder.TOKENIZER_FACTORY_REPLACEMENT_PATTERN.Replace(remainingKey, "");
            tokArgs[tokenizerKey] = args[remainingKey];
            args.Remove(remainingKey);
        }
    }

    if (args.Count == 0)
    {
        return;
    }

    throw new ArgumentException(string.Format(J2N.Text.StringFormatter.CurrentCulture, "Unknown parameters: {0}", args));
}
| |
/// <summary>
/// Builds the synonym map for this factory: instantiates the configured tokenizer
/// factory (if any), loads the rule lines named by the "synonyms" argument and parses
/// them into a new <see cref="SlowSynonymMap"/>.
/// </summary>
/// <param name="loader"> the resource loader used to resolve the tokenizer factory type and to read the rule files </param>
public void Inform(IResourceLoader loader)
{
    TokenizerFactory tokenizerFactory = (tf == null) ? null : LoadTokenizerFactory(loader, tf);
    IEnumerable<string> ruleLines = LoadRules(synonyms, loader);

    // The field is assigned before parsing, exactly as before, so it is already set while rules are added.
    SlowSynonymMap freshMap = new SlowSynonymMap(ignoreCase);
    synMap = freshMap;
    ParseRules(ruleLines, freshMap, "=>", ",", expand, tokenizerFactory);
}
| |
/// <summary>
/// Reads the synonym rule lines. If <paramref name="synonyms"/> names an existing file it is
/// read as a single resource; otherwise it is split into several file names and the lines of
/// each one (name trimmed) are concatenated in order.
/// </summary>
/// <param name="synonyms"> a single file name, or a list of file names in whatever form the inherited SplitFileNames helper accepts </param>
/// <param name="loader"> the resource loader the lines are read through </param>
/// <returns> a list of all rules </returns>
private IEnumerable<string> LoadRules(string synonyms, IResourceLoader loader)
{
    if (File.Exists(synonyms))
    {
        return new List<string>(GetLines(loader, synonyms));
    }

    var allRules = new List<string>();
    foreach (string fileName in SplitFileNames(synonyms))
    {
        allRules.AddRange(GetLines(loader, fileName.Trim()));
    }
    return allRules;
}
| |
// Synonym map assigned by Inform and handed to every filter made by Create; null until Inform has run.
private SlowSynonymMap synMap;
| |
/// <summary>
/// Parses synonym rules and adds the resulting mappings to <paramref name="map"/>.
/// <para/>
/// Each rule is split on <paramref name="mappingSep"/>. A two-part rule maps every entry on the
/// left side to every entry on the right side. A one-part rule maps every entry either to all
/// entries (when <paramref name="expansion"/> is true) or to the first entry only.
/// </summary>
/// <param name="rules"> the rule lines to parse </param>
/// <param name="map"> the map that receives the parsed mappings </param>
/// <param name="mappingSep"> separator between the source and target side of a rule </param>
/// <param name="synSep"> separator between the entries on one side of a rule </param>
/// <param name="expansion"> whether a one-part rule expands to all of its entries </param>
/// <param name="tokFactory"> tokenizer factory used to split an entry into tokens, or null to split on whitespace </param>
/// <exception cref="ArgumentException"> if a rule has no parts, more than two parts, or
/// (when not expanding) a single part that contains no entries </exception>
internal static void ParseRules(IEnumerable<string> rules, SlowSynonymMap map, string mappingSep, string synSep, bool expansion, TokenizerFactory tokFactory)
{
    foreach (string rule in rules)
    {
        // The separators can be backslash-escaped, which a plain string or regex split
        // cannot express easily; SplitSmart handles the escapes and leaves them in place
        // (decode == false) so that the later per-entry split still sees them.
        IList<string> mapping = SplitSmart(rule, mappingSep, false);

        // An empty rule (or one made only of separators) yields no parts; report it the same
        // way as any other malformed rule instead of failing on mapping[0] below.
        if (mapping.Count == 0 || mapping.Count > 2)
        {
            throw new ArgumentException("Invalid Synonym Rule:" + rule);
        }

        IList<IList<string>> source = GetSynList(mapping[0], synSep, tokFactory);
        IList<IList<string>> target;

        if (mapping.Count == 2)
        {
            target = GetSynList(mapping[1], synSep, tokFactory);
        }
        else if (expansion)
        {
            // expand to all arguments
            target = source;
        }
        else
        {
            // reduce to first argument
            if (source.Count == 0)
            {
                throw new ArgumentException("Invalid Synonym Rule:" + rule);
            }
            target = new List<IList<string>>(1)
            {
                source[0]
            };
        }

        bool includeOrig = false;
        foreach (IList<string> fromToks in source)
        {
            foreach (IList<string> toToks in target)
            {
                map.Add(fromToks, SlowSynonymMap.MakeTokens(toToks), includeOrig, true);
            }
        }
    }
}
| |
/// <summary>
/// Splits one side of a rule into its entries and each entry into its tokens.
/// </summary>
/// <param name="str"> one side of a synonym rule </param>
/// <param name="separator"> the separator between entries </param>
/// <param name="tokFactory"> tokenizer factory used to tokenize an entry, or null to split it on whitespace </param>
/// <returns> one token list per entry </returns>
// a , b c , d e f => [[a],[b,c],[d,e,f]]
private static IList<IList<string>> GetSynList(string str, string separator, TokenizerFactory tokFactory)
{
    var entries = new List<IList<string>>();
    foreach (string entry in SplitSmart(str, separator, false))
    {
        // Turn the entry into its token strings: with the configured tokenizer when there
        // is one, otherwise by whitespace (decoding backslash escapes).
        IList<string> tokens;
        if (tokFactory != null)
        {
            tokens = SplitByTokenizer(entry, tokFactory);
        }
        else
        {
            tokens = SplitWS(entry, true);
        }
        entries.Add(tokens);
    }
    return entries;
}
| |
/// <summary>
/// Tokenizes <paramref name="source"/> with a tokenizer created by <paramref name="tokFactory"/>
/// and returns the text of every non-empty term, in order.
/// </summary>
/// <param name="source"> the text to tokenize </param>
/// <param name="tokFactory"> the factory that creates the tokenizer </param>
/// <returns> the non-empty terms produced by the tokenizer </returns>
private static IList<string> SplitByTokenizer(string source, TokenizerFactory tokFactory)
{
    IList<string> tokList = new List<string>();

    // Both the reader and the token stream are released, also when tokenizing throws.
    using (StringReader reader = new StringReader(source))
    using (TokenStream ts = LoadTokenizer(tokFactory, reader))
    {
        ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
        ts.Reset();
        while (ts.IncrementToken())
        {
            if (termAtt.Length > 0)
            {
                tokList.Add(termAtt.ToString());
            }
        }
        // Complete the consumer workflow (Reset / IncrementToken / End / Dispose).
        ts.End();
    }

    return tokList;
}
| |
/// <summary>
/// Instantiates the tokenizer factory type named <paramref name="cname"/>, passing it the
/// collected tokenizer arguments, and informs it of the resource loader when it implements
/// <see cref="IResourceLoaderAware"/>.
/// </summary>
/// <param name="loader"> the resource loader used to resolve the type </param>
/// <param name="cname"> the name of the tokenizer factory type </param>
/// <returns> the new tokenizer factory </returns>
private TokenizerFactory LoadTokenizerFactory(IResourceLoader loader, string cname)
{
    Type factoryType = loader.FindType(cname);

    // Failures propagate unchanged; the previous catch block only rethrew.
    object[] constructorArgs = { tokArgs };
    var factory = (TokenizerFactory)Activator.CreateInstance(factoryType, constructorArgs);

    (factory as IResourceLoaderAware)?.Inform(loader);
    return factory;
}
| |
/// <summary>
/// Creates a tokenizer over <paramref name="reader"/> using <paramref name="tokFactory"/>.
/// </summary>
private static TokenStream LoadTokenizer(TokenizerFactory tokFactory, TextReader reader)
    => tokFactory.Create(reader);
| |
/// <summary>
/// Gets the synonym map built by <see cref="Inform(IResourceLoader)"/>; null before that call.
/// </summary>
public SlowSynonymMap SynonymMap
{
    get { return synMap; }
}
| |
/// <summary>
/// Wraps <paramref name="input"/> in a <see cref="SlowSynonymFilter"/> that uses this factory's synonym map.
/// </summary>
public override TokenStream Create(TokenStream input) => new SlowSynonymFilter(input, synMap);
| |
/// <summary>
/// Splits a backslash escaped string on whitespace.
/// <para/>
/// A character that follows a backslash never acts as a separator. Runs of whitespace
/// produce no empty pieces.
/// </summary>
/// <param name="s"> the string to split </param>
/// <param name="decode"> when true, the backslash is dropped and \n \t \r \b \f are turned into
/// the control characters they name; when false, escapes are copied through unchanged </param>
/// <returns> the non-empty pieces, in order </returns>
public static IList<string> SplitWS(string s, bool decode)
{
    var pieces = new List<string>(2);
    var current = new StringBuilder();
    int length = s.Length;
    int index = 0;

    while (index < length)
    {
        char c = s[index];
        index++;

        if (char.IsWhiteSpace(c))
        {
            // Unescaped whitespace ends the current piece (if there is one).
            if (current.Length != 0)
            {
                pieces.Add(current.ToString());
                current.Clear();
            }
            continue;
        }

        if (c == '\\')
        {
            if (!decode)
            {
                current.Append(c);
            }
            if (index >= length)
            {
                // Trailing backslash with nothing after it: stop here.
                break;
            }

            c = s[index];
            index++;

            if (decode)
            {
                switch (c)
                {
                    case 'n': c = '\n'; break;
                    case 't': c = '\t'; break;
                    case 'r': c = '\r'; break;
                    case 'b': c = '\b'; break;
                    case 'f': c = '\f'; break;
                }
            }
        }

        current.Append(c);
    }

    if (current.Length != 0)
    {
        pieces.Add(current.ToString());
    }

    return pieces;
}
| |
/// <summary>
/// Splits a backslash escaped string on the separator.
/// <para/>
/// Current backslash escaping supported:
/// <para/> \n \t \r \b \f are escaped the same as a .NET string
/// <para/> Other characters following a backslash are produced verbatim (\c => c)
/// <para/>
/// A character that follows a backslash never starts a separator. Empty pieces
/// (leading, trailing or between adjacent separators) are not returned.
/// </summary>
/// <param name="s"> the string to split </param>
/// <param name="separator"> the separator to split on </param>
/// <param name="decode"> decode backslash escaping </param>
/// <returns> the non-empty pieces, in order </returns>
/// <exception cref="ArgumentNullException"> if <paramref name="s"/> or <paramref name="separator"/> is null </exception>
/// <exception cref="ArgumentException"> if <paramref name="separator"/> is empty </exception>
public static IList<string> SplitSmart(string s, string separator, bool decode)
{
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s));
    }
    if (separator == null)
    {
        throw new ArgumentNullException(nameof(separator));
    }
    if (separator.Length == 0)
    {
        // An empty separator matches at every position without consuming anything,
        // so the scan below would never advance.
        throw new ArgumentException("The separator must not be empty.", nameof(separator));
    }

    List<string> lst = new List<string>(2);
    StringBuilder sb = new StringBuilder();
    int pos = 0, end = s.Length;
    while (pos < end)
    {
        // Compare in place instead of allocating a substring for every character.
        if (end - pos >= separator.Length
            && string.CompareOrdinal(s, pos, separator, 0, separator.Length) == 0)
        {
            if (sb.Length > 0)
            {
                lst.Add(sb.ToString());
                sb.Clear();
            }
            pos += separator.Length;
            continue;
        }

        char ch = s[pos++];
        if (ch == '\\')
        {
            if (!decode)
            {
                sb.Append(ch);
            }
            if (pos >= end) // trailing backslash with nothing after it: stop here
            {
                break;
            }
            ch = s[pos++];
            if (decode)
            {
                switch (ch)
                {
                    case 'n':
                        ch = '\n';
                        break;
                    case 't':
                        ch = '\t';
                        break;
                    case 'r':
                        ch = '\r';
                        break;
                    case 'b':
                        ch = '\b';
                        break;
                    case 'f':
                        ch = '\f';
                        break;
                }
            }
        }

        sb.Append(ch);
    }

    if (sb.Length > 0)
    {
        lst.Add(sb.ToString());
    }

    return lst;
}
| } |
| } |