| using J2N.IO; |
| using Lucene.Net.Analysis.Util; |
| using Lucene.Net.Benchmarks.ByTask.Utils; |
| using Lucene.Net.Util; |
| using System; |
| using System.Collections.Generic; |
| using System.Globalization; |
| using System.IO; |
| using System.Reflection; |
| using System.Text.RegularExpressions; |
| |
| namespace Lucene.Net.Benchmarks.ByTask.Tasks |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Analyzer factory construction task. The name given to the constructed factory may |
| /// be given to <see cref="NewAnalyzerTask"/>, which will call <see cref="AnalyzerFactory.Create()"/>. |
| /// </summary> |
| /// <remarks> |
| /// Params are in the form argname:argvalue or argname:"argvalue" or argname:'argvalue'; |
| /// use backslashes to escape '"' or "'" inside a quoted value when it's used as the enclosing |
| /// quotation mark. |
| /// <para/> |
| /// Specify params in a comma separated list of the following, in order: |
| /// <list type="number"> |
| /// <item><description> |
| /// <list type="bullet"> |
| /// <item><description><b>Required</b>: <c>name:<i>analyzer-factory-name</i></c></description></item> |
| /// <item><description>Optional: <c>positionIncrementGap:<i>int value</i></c> (default: 0)</description></item> |
| /// <item><description>Optional: <c>offsetGap:<i>int value</i></c> (default: 1)</description></item> |
| /// </list> |
| /// </description></item> |
| /// <item><description>zero or more CharFilterFactory's, followed by</description></item> |
| /// <item><description>exactly one TokenizerFactory, followed by</description></item> |
| /// <item><description>zero or more TokenFilterFactory's</description></item> |
| /// </list> |
| /// <para/> |
| /// Each component analysis factory may specify <c>luceneMatchVersion</c> (defaults to |
| /// <see cref="LuceneVersion.LUCENE_CURRENT"/>) and any of the args understood by the specified |
| /// *Factory class, in the above-described param format. |
| /// <para/> |
| /// Example: |
| /// <code> |
| /// -AnalyzerFactory(name:'strip html, fold to ascii, whitespace tokenize, max 10k tokens', |
| /// positionIncrementGap:100, |
| /// HTMLStripCharFilter, |
| /// MappingCharFilter(mapping:'mapping-FoldToASCII.txt'), |
| /// WhitespaceTokenizer(luceneMatchVersion:LUCENE_43), |
| /// TokenLimitFilter(maxTokenCount:10000, consumeAllTokens:false)) |
| /// [...] |
| /// -NewAnalyzer('strip html, fold to ascii, whitespace tokenize, max 10k tokens') |
| /// </code> |
| /// <para/> |
| /// <see cref="AnalyzerFactory"/> will direct analysis component factories to look for resources |
| /// under the directory specified in the "work.dir" property. |
| /// </remarks> |
| public class AnalyzerFactoryTask : PerfTask |
| { |
/// <summary>
/// Namespace prefix prepended to a dotted class name when the name cannot be resolved as-is.
/// </summary>
private const string LUCENE_ANALYSIS_PACKAGE_PREFIX = "Lucene.Net.Analysis.";

/// <summary>
/// Matches a trailing "Filter", "TokenFilter", "CharFilter" or "Tokenizer", each optionally
/// followed by "Factory"; stripped from short class names before the SPI lookup.
/// </summary>
private static readonly Regex ANALYSIS_COMPONENT_SUFFIX_PATTERN =
    new Regex("(?s:(?:(?:Token|Char)?Filter|Tokenizer)(?:Factory)?)$", RegexOptions.Compiled);

/// <summary>Matches a trailing ".0", removed from formatted numeric argument values.</summary>
private static readonly Regex TRAILING_DOT_ZERO_PATTERN =
    new Regex(@"\.0$", RegexOptions.Compiled);

/// <summary>Parser state: which kind of top-level entry the params parser accepts next.</summary>
private enum ArgType
{
    ANALYZER_ARG,
    ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER,
    TOKENFILTER
}

// Name under which the built factory is registered; set from the "name" param.
private string factoryName;

// Optional gap settings; left null unless the corresponding param is given.
private int? positionIncrementGap;
private int? offsetGap;

// Pipeline components, in the order they were parsed.
private readonly IList<CharFilterFactory> charFilterFactories = new List<CharFilterFactory>();
private TokenizerFactory tokenizerFactory;
private readonly IList<TokenFilterFactory> tokenFilterFactories = new List<TokenFilterFactory>();
| |
/// <summary>
/// Creates the task, forwarding <paramref name="runData"/> to the base constructor.
/// </summary>
/// <param name="runData">Run data handed to the base task.</param>
public AnalyzerFactoryTask(PerfRunData runData) : base(runData) { }
| |
/// <summary>
/// Performs no work of its own (the factory is built and registered in
/// <see cref="SetParams(string)"/>) and always returns 1.
/// </summary>
/// <returns>Always 1.</returns>
public override int DoLogic() => 1;
| |
/// <summary>
/// Sets the params.
/// Analysis component factory names may optionally include the "Factory" suffix.
/// </summary>
/// <remarks>
/// Parsing moves through three states, in order: analyzer args (<c>name</c>,
/// <c>positionIncrementGap</c>, <c>offsetGap</c>) until <c>name</c> has been read; then
/// gap args, char filters, or the tokenizer; and, once a tokenizer has been created,
/// token filters only. After parsing, the resulting <see cref="AnalyzerFactory"/> is stored
/// in <c>RunData.AnalyzerFactories</c> under the given name.
/// </remarks>
/// <param name="params">
/// analysis pipeline specification: name, (optional) positionIncrementGap,
/// (optional) offsetGap, 0+ CharFilterFactory's, 1 TokenizerFactory,
/// and 0+ TokenFilterFactory's
/// </param>
/// <exception cref="Exception">
/// If the specification cannot be parsed, a component class cannot be found, or no
/// <c>name</c> param was given. The message is prefixed with "Line #N: ".
/// </exception>
public override void SetParams(string @params)
{
    base.SetParams(@params);
    ArgType expectedArgType = ArgType.ANALYZER_ARG;

    StreamTokenizer stok = new StreamTokenizer(new StringReader(@params));
    stok.CommentChar('#');
    stok.QuoteChar('"');
    stok.QuoteChar('\'');
    stok.EndOfLineIsSignificant = false;
    stok.OrdinaryChar('(');
    stok.OrdinaryChar(')');
    stok.OrdinaryChar(':');
    stok.OrdinaryChar(',');
    try
    {
        while (stok.NextToken() != StreamTokenizer.TokenType_EndOfStream)
        {
            if (stok.TokenType == ',')
            {
                // Separator between top-level entries: nothing to do
                continue;
            }
            if (stok.TokenType != StreamTokenizer.TokenType_Word)
            {
                throw new Exception("Line #" + GetLineNumber(stok) + ": Unexpected token: " + stok.ToString());
            }

            string argName = stok.StringValue;

            if (expectedArgType == ArgType.ANALYZER_ARG)
            {
                bool isNameArg = argName.Equals("name", StringComparison.OrdinalIgnoreCase);
                if (!isNameArg && !IsGapArgName(argName))
                {
                    throw new Exception
                        ("Line #" + GetLineNumber(stok) + ": Missing 'name' param to AnalyzerFactory: '" + @params + "'");
                }
                AdvanceToArgValue(stok, argName);

                string argValue;
                switch (stok.TokenType)
                {
                    case StreamTokenizer.TokenType_Number:
                        argValue = stok.NumberValue.ToString(CultureInfo.InvariantCulture);
                        // Drop the ".0" from numbers, for integer arguments
                        argValue = TRAILING_DOT_ZERO_PATTERN.Replace(argValue, "", 1);
                        break;
                    case '"':
                    case '\'':
                    case StreamTokenizer.TokenType_Word:
                        argValue = stok.StringValue;
                        break;
                    default:
                        throw UnexpectedArgValueToken(stok);
                }

                if (isNameArg)
                {
                    factoryName = argValue;
                    // Only after the name has been seen may pipeline components follow
                    expectedArgType = ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER;
                }
                else
                {
                    AssignGapArg(argName, ParseGapArgValue(stok, argName, argValue, argValue));
                }
            }
            else if (expectedArgType == ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER)
            {
                if (IsGapArgName(argName))
                {
                    AdvanceToArgValue(stok, argName);

                    int gapValue;
                    switch (stok.TokenType)
                    {
                        case '"':
                        case '\'':
                        case StreamTokenizer.TokenType_Word:
                            gapValue = ParseGapArgValue(stok, argName, stok.StringValue.Trim(), stok.StringValue);
                            break;
                        case StreamTokenizer.TokenType_Number:
                            // Unlike the pre-name state above, a bare number here is truncated
                            // rather than re-parsed, so a fractional part is silently dropped.
                            gapValue = NumberTokenToInt(stok, argName);
                            break;
                        default:
                            throw UnexpectedArgValueToken(stok);
                    }
                    AssignGapArg(argName, gapValue);
                    continue;
                }

                try
                {
                    Type charFilterClass = LookupAnalysisClass(argName, typeof(CharFilterFactory));
                    CreateAnalysisPipelineComponent(stok, charFilterClass);
                }
                catch (ArgumentException /*e*/)
                {
                    // Not a char filter: the only other component allowed here is the tokenizer
                    try
                    {
                        Type tokenizerClass = LookupAnalysisClass(argName, typeof(TokenizerFactory));
                        CreateAnalysisPipelineComponent(stok, tokenizerClass);
                        expectedArgType = ArgType.TOKENFILTER;
                    }
                    catch (ArgumentException e2)
                    {
                        throw new Exception("Line #" + GetLineNumber(stok) + ": Can't find class '"
                            + argName + "' as CharFilterFactory or TokenizerFactory", e2);
                    }
                }
            }
            else
            {   // expectedArgType = ArgType.TOKENFILTER
                Type tokenFilterClass;
                try
                {
                    tokenFilterClass = LookupAnalysisClass(argName, typeof(TokenFilterFactory));
                }
                catch (ArgumentException e)
                {
                    throw new Exception
                        ("Line #" + GetLineNumber(stok) + ": Can't find class '" + argName + "' as TokenFilterFactory", e);
                }
                CreateAnalysisPipelineComponent(stok, tokenFilterClass);
            }
        }
    }
    catch (Exception e)
    {
        if (e.Message.StartsWith("Line #", StringComparison.Ordinal))
        {
            throw; // LUCENENET: CA2200: Rethrow to preserve stack details (https://docs.microsoft.com/en-us/visualstudio/code-quality/ca2200-rethrow-to-preserve-stack-details)
        }
        else
        {
            throw new Exception("Line #" + GetLineNumber(stok) + ": ", e);
        }
    }

    // 'name' is a required param; without it there is no key to register the factory under.
    if (factoryName == null)
    {
        throw new Exception
            ("Line #" + GetLineNumber(stok) + ": Missing 'name' param to AnalyzerFactory: '" + @params + "'");
    }

    AnalyzerFactory analyzerFactory = new AnalyzerFactory(charFilterFactories, tokenizerFactory, tokenFilterFactories)
    {
        PositionIncrementGap = positionIncrementGap,
        OffsetGap = offsetGap
    };
    RunData.AnalyzerFactories[factoryName] = analyzerFactory;
}

/// <summary>Returns true if the name is "positionIncrementGap" or "offsetGap" (case-insensitive).</summary>
private static bool IsGapArgName(string argName)
{
    return argName.Equals("positionIncrementGap", StringComparison.OrdinalIgnoreCase)
        || argName.Equals("offsetGap", StringComparison.OrdinalIgnoreCase);
}

/// <summary>Consumes the ':' that must follow an analyzer arg name and advances to the value token.</summary>
private void AdvanceToArgValue(StreamTokenizer stok, string argName)
{
    stok.NextToken();
    if (stok.TokenType != ':')
    {
        throw new Exception
            ("Line #" + GetLineNumber(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory");
    }
    stok.NextToken();
}

/// <summary>Builds the error for a value token that is neither a number, a quoted string nor a word.</summary>
private Exception UnexpectedArgValueToken(StreamTokenizer stok)
{
    return stok.TokenType == StreamTokenizer.TokenType_EndOfStream
        ? new Exception("Unexpected EOF: " + stok.ToString())
        : new Exception("Line #" + GetLineNumber(stok) + ": Unexpected token: " + stok.ToString());
}

/// <summary>
/// Parses a gap value as an invariant-culture integer, reporting malformed or out-of-range
/// text with the line number, arg name and <paramref name="reportedValue"/>.
/// </summary>
private int ParseGapArgValue(StreamTokenizer stok, string argName, string text, string reportedValue)
{
    try
    {
        return int.Parse(text, CultureInfo.InvariantCulture);
    }
    // int.Parse signals out-of-range input with OverflowException, not FormatException
    catch (Exception e) when (e is FormatException || e is OverflowException)
    {
        throw new Exception
            ("Line #" + GetLineNumber(stok) + ": Exception parsing " + argName + " value '" + reportedValue + "'", e);
    }
}

/// <summary>
/// Converts the current number token to an int by truncation, rejecting values outside the
/// int range (an unchecked double-to-int cast of such a value yields an unspecified result).
/// </summary>
private int NumberTokenToInt(StreamTokenizer stok, string argName)
{
    double number = stok.NumberValue;
    if (double.IsNaN(number) || number < int.MinValue || number > int.MaxValue)
    {
        throw new Exception("Line #" + GetLineNumber(stok) + ": Exception parsing " + argName + " value '"
            + number.ToString(CultureInfo.InvariantCulture) + "'");
    }
    return (int)number;
}

/// <summary>Stores a parsed gap value in the field matching <paramref name="argName"/>.</summary>
private void AssignGapArg(string argName, int value)
{
    if (argName.Equals("positionIncrementGap", StringComparison.OrdinalIgnoreCase))
    {
        positionIncrementGap = value;
    }
    else if (argName.Equals("offsetGap", StringComparison.OrdinalIgnoreCase))
    {
        offsetGap = value;
    }
}
| |
/// <summary>
/// Reads the optional parenthesized <c>arg:value</c> list that follows a component name,
/// instantiates <paramref name="clazz"/> with those args, and records the new instance as a
/// char filter, the tokenizer, or a token filter of the pipeline being built.
/// </summary>
/// <remarks>
/// Reading stops at the first ',' outside parentheses or at end of stream. If no
/// <c>luceneMatchVersion</c> arg was given, <see cref="LuceneVersion.LUCENE_CURRENT"/> is used.
/// Factories implementing <see cref="IResourceLoaderAware"/> are informed with a loader rooted
/// at the "work.dir" config value (default "work").
/// </remarks>
/// <param name="stok">Stream tokenizer from which to draw analysis factory params.</param>
/// <param name="clazz">Analysis factory class to instantiate.</param>
private void CreateAnalysisPipelineComponent(StreamTokenizer stok, Type clazz)
{
    IDictionary<string, string> factoryArgs = new Dictionary<string, string>();
    bool insideParens = false;
    try
    {
        bool endOfComponent = false;
        while (!endOfComponent && stok.NextToken() != StreamTokenizer.TokenType_EndOfStream)
        {
            int tokenType = stok.TokenType;
            if (tokenType == ',')
            {
                // Inside parentheses a comma merely separates args; outside, it ends
                // this analysis factory configuration.
                endOfComponent = !insideParens;
            }
            else if (tokenType == '(')
            {
                if (insideParens)
                {
                    throw new Exception
                        ("Line #" + GetLineNumber(stok) + ": Unexpected opening parenthesis.");
                }
                insideParens = true;
            }
            else if (tokenType == ')')
            {
                if (!insideParens)
                {
                    throw new Exception
                        ("Line #" + GetLineNumber(stok) + ": Unexpected closing parenthesis.");
                }
                insideParens = false;
            }
            else if (tokenType == StreamTokenizer.TokenType_Word)
            {
                if (!insideParens)
                {
                    throw new Exception("Line #" + GetLineNumber(stok) + ": Unexpected token '" + stok.StringValue + "'");
                }
                string key = stok.StringValue;
                stok.NextToken();
                if (stok.TokenType != ':')
                {
                    throw new Exception
                        ("Line #" + GetLineNumber(stok) + ": Missing ':' after '" + key + "' param to " + clazz.Name);
                }
                stok.NextToken();

                int valueType = stok.TokenType;
                string value;
                if (valueType == StreamTokenizer.TokenType_Number)
                {
                    // Drop the ".0" from numbers, for integer arguments
                    string formatted = stok.NumberValue.ToString(CultureInfo.InvariantCulture);
                    value = TRAILING_DOT_ZERO_PATTERN.Replace(formatted, "", 1);
                }
                else if (valueType == '"' || valueType == '\'' || valueType == StreamTokenizer.TokenType_Word)
                {
                    value = stok.StringValue;
                }
                else if (valueType == StreamTokenizer.TokenType_EndOfStream)
                {
                    throw new Exception("Unexpected EOF: " + stok.ToString());
                }
                else
                {
                    throw new Exception
                        ("Line #" + GetLineNumber(stok) + ": Unexpected token: " + stok.ToString());
                }
                factoryArgs[key] = value;
            }
            // Any other token type is skipped.
        }

        if (!factoryArgs.ContainsKey("luceneMatchVersion"))
        {
#pragma warning disable 612, 618
            factoryArgs["luceneMatchVersion"] = LuceneVersion.LUCENE_CURRENT.ToString();
#pragma warning restore 612, 618
        }

        AbstractAnalysisFactory factory;
        try
        {
            factory = (AbstractAnalysisFactory)Activator.CreateInstance(clazz, factoryArgs);
        }
        catch (Exception e)
        {
            throw new Exception("Line #" + GetLineNumber(stok) + ": ", e);
        }

        if (factory is IResourceLoaderAware loaderAware)
        {
            DirectoryInfo workDir = new DirectoryInfo(RunData.Config.Get("work.dir", "work"));
            loaderAware.Inform(new FilesystemResourceLoader(workDir));
        }

        // File the instance under the first matching component kind.
        switch (factory)
        {
            case CharFilterFactory charFilter:
                charFilterFactories.Add(charFilter);
                break;
            case TokenizerFactory tokenizer:
                tokenizerFactory = tokenizer;
                break;
            case TokenFilterFactory tokenFilter:
                tokenFilterFactories.Add(tokenFilter);
                break;
        }
    }
    catch (Exception e)
    {
        if (!e.Message.StartsWith("Line #", StringComparison.Ordinal))
        {
            throw new Exception("Line #" + GetLineNumber(stok) + ": ", e);
        }
        throw; // LUCENENET: CA2200: Rethrow to preserve stack details (https://docs.microsoft.com/en-us/visualstudio/code-quality/ca2200-rethrow-to-preserve-stack-details)
    }
}
| |
/// <summary>
/// This method looks up a class with its fully qualified name (FQN), or a short-name
/// class-simplename, or with a package suffix, assuming "Lucene.Net.Analysis."
/// as the namespace prefix (e.g. "Standard.ClassicTokenizerFactory" ->
/// "Lucene.Net.Analysis.Standard.ClassicTokenizerFactory").
/// </summary>
/// <remarks>
/// If <paramref name="className"/> contains a period, the class is first looked up as-is, assuming that it
/// is an FQN. If this fails, lookup is retried after prepending the Lucene analysis
/// package prefix to the class name. The resolved type must be assignable to
/// <paramref name="expectedType"/>; otherwise an <see cref="ArgumentException"/> is thrown so that
/// callers can retry the lookup with a different expected type.
/// <para/>
/// If <paramref name="className"/> does not contain a period, the analysis SPI *Factory.LookupClass()
/// methods are used to find the class, after stripping any trailing
/// "Filter"/"TokenFilter"/"CharFilter"/"Tokenizer" (+ optional "Factory") suffix.
/// <para/>
/// NOTE(review): <see cref="Type.GetType(string)"/> is case-sensitive and, for names that are not
/// assembly-qualified, searches only the calling assembly and the core library — confirm that
/// dotted names are expected to resolve types living in other analysis assemblies.
/// </remarks>
/// <param name="className">The namespace qualified name or the short name of the class.</param>
/// <param name="expectedType">The superclass <paramref name="className"/> is expected to extend. </param>
/// <returns>The loaded type.</returns>
/// <exception cref="TypeLoadException">If lookup fails.</exception>
/// <exception cref="ArgumentNullException">If <paramref name="className"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">
/// If a dotted <paramref name="className"/> resolves to a type that is not assignable to
/// <paramref name="expectedType"/>. Presumably also thrown by the SPI LookupClass() methods for
/// unknown short names (callers in this file catch it) — verify against the SPI loader.
/// </exception>
public virtual Type LookupAnalysisClass(string className, Type expectedType)
{
    if (className == null)
    {
        throw new ArgumentNullException(nameof(className));
    }

    if (className.Contains("."))
    {
        // First, try className == FQN
        Type result = Type.GetType(className);
        if (result == null)
        {
            // Second, retry lookup after prepending the Lucene analysis package prefix
            result = Type.GetType(LUCENE_ANALYSIS_PACKAGE_PREFIX + className);

            if (result == null)
            {
                throw new TypeLoadException("Can't find class '" + className
                    + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'");
            }
        }

        // Without this check a dotted tokenizer name would be accepted when a char filter
        // was requested, and the caller would never learn it has to fall back to (and
        // advance its state for) the tokenizer lookup.
        if (expectedType != null && !expectedType.IsAssignableFrom(result))
        {
            throw new ArgumentException("Class '" + result.FullName + "' is not a subclass of '"
                + expectedType.FullName + "'", nameof(className));
        }
        return result;
    }

    // No dot - use analysis SPI lookup
    string analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.Replace(className, "", 1);
    if (typeof(CharFilterFactory).IsAssignableFrom(expectedType))
    {
        return CharFilterFactory.LookupClass(analysisComponentName);
    }
    else if (typeof(TokenizerFactory).IsAssignableFrom(expectedType))
    {
        return TokenizerFactory.LookupClass(analysisComponentName);
    }
    else if (typeof(TokenFilterFactory).IsAssignableFrom(expectedType))
    {
        return TokenFilterFactory.LookupClass(analysisComponentName);
    }

    // expectedType is none of the three supported factory kinds
    throw new TypeLoadException("Can't find class '" + className + "'");
}
| |
/// <summary>Always <c>true</c>: this task is configured through its params.</summary>
/// <seealso cref="PerfTask.SupportsParams"/>
public override bool SupportsParams
{
    get { return true; }
}
| |
/// <summary>
/// Returns the current line in the algorithm file, computed as <c>AlgLineNum</c> plus the
/// tokenizer's own line number.
/// </summary>
/// <param name="stok">Tokenizer currently reading this task's params.</param>
/// <returns>The sum of <c>AlgLineNum</c> and <c>stok.LineNumber</c>.</returns>
public virtual int GetLineNumber(StreamTokenizer stok) => AlgLineNum + stok.LineNumber;
| } |
| } |