| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.benchmark.byTask.tasks; |
| |
| import java.io.StreamTokenizer; |
| import java.io.StringReader; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.nio.file.Paths; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.regex.Pattern; |
| import org.apache.lucene.analysis.AbstractAnalysisFactory; |
| import org.apache.lucene.analysis.CharFilterFactory; |
| import org.apache.lucene.analysis.TokenFilterFactory; |
| import org.apache.lucene.analysis.TokenizerFactory; |
| import org.apache.lucene.analysis.util.FilesystemResourceLoader; |
| import org.apache.lucene.benchmark.byTask.PerfRunData; |
| import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory; |
| import org.apache.lucene.util.ResourceLoaderAware; |
| import org.apache.lucene.util.Version; |
| |
| /** |
| * Analyzer factory construction task. The name given to the constructed factory may be given to |
| * NewAnalyzerTask, which will call AnalyzerFactory.create(). |
| * |
| * <p>Params are in the form argname:argvalue or argname:"argvalue" or argname:'argvalue'; use |
| * backslashes to escape '"' or "'" inside a quoted value when it's used as the enclosing quotation |
| * mark. |
| * |
| * <p>Specify params in a comma separated list of the following, in order: |
| * |
| * <ol> |
| * <li>Analyzer args: |
| * <ul> |
| * <li><b>Required</b>: <code>name:<i>analyzer-factory-name</i></code> |
| * <li>Optional: <code>positionIncrementGap:<i>int value</i></code> (default: 0) |
| * <li>Optional: <code>offsetGap:<i>int value</i></code> (default: 1) |
| * </ul> |
| * <li>zero or more CharFilterFactory's, followed by |
| * <li>exactly one TokenizerFactory, followed by |
| * <li>zero or more TokenFilterFactory's |
| * </ol> |
| * |
| * Each component analysis factory may specify <code>luceneMatchVersion</code> (defaults to {@link |
| * Version#LATEST}) and any of the args understood by the specified *Factory class, in the |
| * above-described param format. |
| * |
| * <p>Example: |
| * |
| * <pre> |
| * -AnalyzerFactory(name:'strip html, fold to ascii, whitespace tokenize, max 10k tokens', |
| * positionIncrementGap:100, |
| * HTMLStripCharFilter, |
| * MappingCharFilter(mapping:'mapping-FoldToASCII.txt'), |
| * WhitespaceTokenizer(luceneMatchVersion:LUCENE_5_0_0), |
| * TokenLimitFilter(maxTokenCount:10000, consumeAllTokens:false)) |
| * [...] |
| * -NewAnalyzer('strip html, fold to ascii, whitespace tokenize, max 10k tokens') |
| * </pre> |
| * |
| * <p>AnalyzerFactory will direct analysis component factories to look for resources under the |
| * directory specified in the "work.dir" property. |
| */ |
| public class AnalyzerFactoryTask extends PerfTask { |
| private static final String LUCENE_ANALYSIS_PACKAGE_PREFIX = "org.apache.lucene.analysis."; |
| private static final Pattern ANALYSIS_COMPONENT_SUFFIX_PATTERN = |
| Pattern.compile("(?s:(?:(?:Token|Char)?Filter|Tokenizer)(?:Factory)?)$"); |
| private static final Pattern TRAILING_DOT_ZERO_PATTERN = Pattern.compile("\\.0$"); |
| |
| private enum ArgType { |
| ANALYZER_ARG, |
| ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER, |
| TOKENFILTER |
| } |
| |
| String factoryName = null; |
| Integer positionIncrementGap = null; |
| Integer offsetGap = null; |
| private List<CharFilterFactory> charFilterFactories = new ArrayList<>(); |
| private TokenizerFactory tokenizerFactory = null; |
| private List<TokenFilterFactory> tokenFilterFactories = new ArrayList<>(); |
| |
/**
 * Creates the task.
 *
 * @param runData benchmark run state, passed through to the superclass constructor
 */
public AnalyzerFactoryTask(PerfRunData runData) {
  super(runData);
}
| |
| @Override |
| public int doLogic() { |
| return 1; |
| } |
| |
| /** |
| * Sets the params. Analysis component factory names may optionally include the "Factory" suffix. |
| * |
| * @param params analysis pipeline specification: name, (optional) positionIncrementGap, |
| * (optional) offsetGap, 0+ CharFilterFactory's, 1 TokenizerFactory, and 0+ |
| * TokenFilterFactory's |
| */ |
| @Override |
| @SuppressWarnings("fallthrough") |
| public void setParams(String params) { |
| super.setParams(params); |
| ArgType expectedArgType = ArgType.ANALYZER_ARG; |
| |
| final StreamTokenizer stok = new StreamTokenizer(new StringReader(params)); |
| stok.commentChar('#'); |
| stok.quoteChar('"'); |
| stok.quoteChar('\''); |
| stok.eolIsSignificant(false); |
| stok.ordinaryChar('('); |
| stok.ordinaryChar(')'); |
| stok.ordinaryChar(':'); |
| stok.ordinaryChar(','); |
| try { |
| while (stok.nextToken() != StreamTokenizer.TT_EOF) { |
| switch (stok.ttype) { |
| case ',': |
| { |
| // Do nothing |
| break; |
| } |
| case StreamTokenizer.TT_WORD: |
| { |
| if (expectedArgType.equals(ArgType.ANALYZER_ARG)) { |
| final String argName = stok.sval; |
| if (!argName.equalsIgnoreCase("name") |
| && !argName.equalsIgnoreCase("positionIncrementGap") |
| && !argName.equalsIgnoreCase("offsetGap")) { |
| throw new RuntimeException( |
| "Line #" |
| + lineno(stok) |
| + ": Missing 'name' param to AnalyzerFactory: '" |
| + params |
| + "'"); |
| } |
| stok.nextToken(); |
| if (stok.ttype != ':') { |
| throw new RuntimeException( |
| "Line #" |
| + lineno(stok) |
| + ": Missing ':' after '" |
| + argName |
| + "' param to AnalyzerFactory"); |
| } |
| |
| stok.nextToken(); |
| String argValue = stok.sval; |
| switch (stok.ttype) { |
| case StreamTokenizer.TT_NUMBER: |
| { |
| argValue = Double.toString(stok.nval); |
| // Drop the ".0" from numbers, for integer arguments |
| argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst(""); |
| // Intentional fallthrough |
| } |
| case '"': |
| case '\'': |
| case StreamTokenizer.TT_WORD: |
| { |
| if (argName.equalsIgnoreCase("name")) { |
| factoryName = argValue; |
| expectedArgType = ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER; |
| } else { |
| int intArgValue = 0; |
| try { |
| intArgValue = Integer.parseInt(argValue); |
| } catch (NumberFormatException e) { |
| throw new RuntimeException( |
| "Line #" |
| + lineno(stok) |
| + ": Exception parsing " |
| + argName |
| + " value '" |
| + argValue |
| + "'", |
| e); |
| } |
| if (argName.equalsIgnoreCase("positionIncrementGap")) { |
| positionIncrementGap = intArgValue; |
| } else if (argName.equalsIgnoreCase("offsetGap")) { |
| offsetGap = intArgValue; |
| } |
| } |
| break; |
| } |
| case StreamTokenizer.TT_EOF: |
| { |
| throw new RuntimeException("Unexpected EOF: " + stok.toString()); |
| } |
| default: |
| { |
| throw new RuntimeException( |
| "Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); |
| } |
| } |
| } else if (expectedArgType.equals(ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER)) { |
| final String argName = stok.sval; |
| |
| if (argName.equalsIgnoreCase("positionIncrementGap") |
| || argName.equalsIgnoreCase("offsetGap")) { |
| stok.nextToken(); |
| if (stok.ttype != ':') { |
| throw new RuntimeException( |
| "Line #" |
| + lineno(stok) |
| + ": Missing ':' after '" |
| + argName |
| + "' param to AnalyzerFactory"); |
| } |
| stok.nextToken(); |
| int intArgValue = (int) stok.nval; |
| switch (stok.ttype) { |
| case '"': |
| case '\'': |
| case StreamTokenizer.TT_WORD: |
| { |
| intArgValue = 0; |
| try { |
| intArgValue = Integer.parseInt(stok.sval.trim()); |
| } catch (NumberFormatException e) { |
| throw new RuntimeException( |
| "Line #" |
| + lineno(stok) |
| + ": Exception parsing " |
| + argName |
| + " value '" |
| + stok.sval |
| + "'", |
| e); |
| } |
| // Intentional fall-through |
| } |
| case StreamTokenizer.TT_NUMBER: |
| { |
| if (argName.equalsIgnoreCase("positionIncrementGap")) { |
| positionIncrementGap = intArgValue; |
| } else if (argName.equalsIgnoreCase("offsetGap")) { |
| offsetGap = intArgValue; |
| } |
| break; |
| } |
| case StreamTokenizer.TT_EOF: |
| { |
| throw new RuntimeException("Unexpected EOF: " + stok.toString()); |
| } |
| default: |
| { |
| throw new RuntimeException( |
| "Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); |
| } |
| } |
| break; |
| } |
| try { |
| final Class<? extends CharFilterFactory> clazz; |
| clazz = lookupAnalysisClass(argName, CharFilterFactory.class); |
| createAnalysisPipelineComponent(stok, clazz); |
| } catch (IllegalArgumentException e) { |
| try { |
| final Class<? extends TokenizerFactory> clazz; |
| clazz = lookupAnalysisClass(argName, TokenizerFactory.class); |
| createAnalysisPipelineComponent(stok, clazz); |
| expectedArgType = ArgType.TOKENFILTER; |
| } catch (IllegalArgumentException e2) { |
| throw new RuntimeException( |
| "Line #" |
| + lineno(stok) |
| + ": Can't find class '" |
| + argName |
| + "' as CharFilterFactory or TokenizerFactory"); |
| } |
| } |
| } else { // expectedArgType = ArgType.TOKENFILTER |
| final String className = stok.sval; |
| final Class<? extends TokenFilterFactory> clazz; |
| try { |
| clazz = lookupAnalysisClass(className, TokenFilterFactory.class); |
| } catch (IllegalArgumentException e) { |
| throw new RuntimeException( |
| "Line #" |
| + lineno(stok) |
| + ": Can't find class '" |
| + className |
| + "' as TokenFilterFactory"); |
| } |
| createAnalysisPipelineComponent(stok, clazz); |
| } |
| break; |
| } |
| default: |
| { |
| throw new RuntimeException( |
| "Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); |
| } |
| } |
| } |
| } catch (RuntimeException e) { |
| if (e.getMessage().startsWith("Line #")) { |
| throw e; |
| } else { |
| throw new RuntimeException("Line #" + lineno(stok) + ": ", e); |
| } |
| } catch (Throwable t) { |
| throw new RuntimeException("Line #" + lineno(stok) + ": ", t); |
| } |
| |
| final AnalyzerFactory analyzerFactory = |
| new AnalyzerFactory(charFilterFactories, tokenizerFactory, tokenFilterFactories); |
| analyzerFactory.setPositionIncrementGap(positionIncrementGap); |
| analyzerFactory.setOffsetGap(offsetGap); |
| getRunData().getAnalyzerFactories().put(factoryName, analyzerFactory); |
| } |
| |
| /** |
| * Instantiates the given analysis factory class after pulling params from the given stream |
| * tokenizer, then stores the result in the appropriate pipeline component list. |
| * |
| * @param stok stream tokenizer from which to draw analysis factory params |
| * @param clazz analysis factory class to instantiate |
| */ |
| @SuppressWarnings("fallthrough") |
| private void createAnalysisPipelineComponent( |
| StreamTokenizer stok, Class<? extends AbstractAnalysisFactory> clazz) { |
| Map<String, String> argMap = new HashMap<>(); |
| boolean parenthetical = false; |
| try { |
| WHILE_LOOP: |
| while (stok.nextToken() != StreamTokenizer.TT_EOF) { |
| switch (stok.ttype) { |
| case ',': |
| { |
| if (parenthetical) { |
| // Do nothing |
| break; |
| } else { |
| // Finished reading this analysis factory configuration |
| break WHILE_LOOP; |
| } |
| } |
| case '(': |
| { |
| if (parenthetical) { |
| throw new RuntimeException( |
| "Line #" + lineno(stok) + ": Unexpected opening parenthesis."); |
| } |
| parenthetical = true; |
| break; |
| } |
| case ')': |
| { |
| if (parenthetical) { |
| parenthetical = false; |
| } else { |
| throw new RuntimeException( |
| "Line #" + lineno(stok) + ": Unexpected closing parenthesis."); |
| } |
| break; |
| } |
| case StreamTokenizer.TT_WORD: |
| { |
| if (!parenthetical) { |
| throw new RuntimeException( |
| "Line #" + lineno(stok) + ": Unexpected token '" + stok.sval + "'"); |
| } |
| String argName = stok.sval; |
| stok.nextToken(); |
| if (stok.ttype != ':') { |
| throw new RuntimeException( |
| "Line #" |
| + lineno(stok) |
| + ": Missing ':' after '" |
| + argName |
| + "' param to " |
| + clazz.getSimpleName()); |
| } |
| stok.nextToken(); |
| String argValue = stok.sval; |
| switch (stok.ttype) { |
| case StreamTokenizer.TT_NUMBER: |
| { |
| argValue = Double.toString(stok.nval); |
| // Drop the ".0" from numbers, for integer arguments |
| argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst(""); |
| // Intentional fall-through |
| } |
| case '"': |
| case '\'': |
| case StreamTokenizer.TT_WORD: |
| { |
| argMap.put(argName, argValue); |
| break; |
| } |
| case StreamTokenizer.TT_EOF: |
| { |
| throw new RuntimeException("Unexpected EOF: " + stok.toString()); |
| } |
| default: |
| { |
| throw new RuntimeException( |
| "Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); |
| } |
| } |
| } |
| } |
| } |
| if (!argMap.containsKey("luceneMatchVersion")) { |
| argMap.put("luceneMatchVersion", Version.LATEST.toString()); |
| } |
| final AbstractAnalysisFactory instance; |
| try { |
| instance = clazz.getConstructor(Map.class).newInstance(argMap); |
| } catch (Exception e) { |
| throw new RuntimeException("Line #" + lineno(stok) + ": ", e); |
| } |
| if (instance instanceof ResourceLoaderAware) { |
| Path baseDir = Paths.get(getRunData().getConfig().get("work.dir", "work")); |
| if (!Files.isDirectory(baseDir)) { |
| baseDir = Paths.get("."); |
| } |
| ((ResourceLoaderAware) instance).inform(new FilesystemResourceLoader(baseDir)); |
| } |
| if (CharFilterFactory.class.isAssignableFrom(clazz)) { |
| charFilterFactories.add((CharFilterFactory) instance); |
| } else if (TokenizerFactory.class.isAssignableFrom(clazz)) { |
| tokenizerFactory = (TokenizerFactory) instance; |
| } else if (TokenFilterFactory.class.isAssignableFrom(clazz)) { |
| tokenFilterFactories.add((TokenFilterFactory) instance); |
| } |
| } catch (RuntimeException e) { |
| if (e.getMessage().startsWith("Line #")) { |
| throw (e); |
| } else { |
| throw new RuntimeException("Line #" + lineno(stok) + ": ", e); |
| } |
| } catch (Throwable t) { |
| throw new RuntimeException("Line #" + lineno(stok) + ": ", t); |
| } |
| } |
| |
| /** |
| * This method looks up a class with its fully qualified name (FQN), or a short-name |
| * class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis." as the |
| * package prefix (e.g. "standard.ClassicTokenizerFactory" -> |
| * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory"). |
| * |
| * <p>If className contains a period, the class is first looked up as-is, assuming that it is an |
| * FQN. If this fails, lookup is retried after prepending the Lucene analysis package prefix to |
| * the class name. |
| * |
| * <p>If className does not contain a period, the analysis SPI *Factory.lookupClass() methods are |
| * used to find the class. |
| * |
| * @param className The name or the short name of the class. |
| * @param expectedType The superclass className is expected to extend |
| * @return the loaded class. |
| * @throws ClassNotFoundException if lookup fails |
| */ |
| public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType) |
| throws ClassNotFoundException { |
| if (className.contains(".")) { |
| try { |
| // First, try className == FQN |
| return Class.forName(className).asSubclass(expectedType); |
| } catch (ClassNotFoundException e) { |
| try { |
| // Second, retry lookup after prepending the Lucene analysis package prefix |
| return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType); |
| } catch (ClassNotFoundException e1) { |
| throw new ClassNotFoundException( |
| "Can't find class '" |
| + className |
| + "' or '" |
| + LUCENE_ANALYSIS_PACKAGE_PREFIX |
| + className |
| + "'"); |
| } |
| } |
| } |
| // No dot - use analysis SPI lookup |
| final String analysisComponentName = |
| ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst(""); |
| if (CharFilterFactory.class.isAssignableFrom(expectedType)) { |
| return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType); |
| } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) { |
| return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType); |
| } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) { |
| return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType); |
| } |
| |
| throw new ClassNotFoundException("Can't find class '" + className + "'"); |
| } |
| |
| /* (non-Javadoc) |
| * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() |
| */ |
| @Override |
| public boolean supportsParams() { |
| return true; |
| } |
| |
| /** Returns the current line in the algorithm file */ |
| public int lineno(StreamTokenizer stok) { |
| return getAlgLineNum() + stok.lineno(); |
| } |
| } |